From 92e5ce2a65c94752caa07eaa4880c3f3832ac2de Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 22 Aug 2023 18:50:32 -0300 Subject: [PATCH 001/252] Copied deformable detr --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/grounding-dino.md | 48 + src/transformers/__init__.py | 16 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/grounding_dino/__init__.py | 57 + .../configuration_grounding_dino.py | 262 ++ .../convert_grounding_dino_to_pytorch.py | 237 ++ .../models/grounding_dino/load_custom.py | 49 + .../grounding_dino/modeling_grounding_dino.py | 2513 +++++++++++++++++ tests/models/grounding_dino/__init__.py | 0 .../test_modeling_grounding_dino.py | 673 +++++ 15 files changed, 3865 insertions(+) create mode 100644 docs/source/en/model_doc/grounding-dino.md create mode 100644 src/transformers/models/grounding_dino/__init__.py create mode 100644 src/transformers/models/grounding_dino/configuration_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py create mode 100644 src/transformers/models/grounding_dino/load_custom.py create mode 100644 src/transformers/models/grounding_dino/modeling_grounding_dino.py create mode 100644 tests/models/grounding_dino/__init__.py create mode 100644 tests/models/grounding_dino/test_modeling_grounding_dino.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d054faf0712fe7..0b5e0434e7bb2f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -506,6 +506,8 @@ title: FocalNet - local: model_doc/glpn title: GLPN + - local: model_doc/grounding-dino + title: Grounding DINO - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md new file mode 100644 index 00000000000000..161a90609174b3 --- /dev/null +++ b/docs/source/en/model_doc/grounding-dino.md @@ -0,0 +1,48 @@ + + +# Grounding DINO + +## Overview + +The Grounding DINO model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
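A minimal instantiation sketch, mirroring the example in the `GroundingDINOConfig` docstring (randomly initialized weights; no pretrained checkpoint is assumed):

```python
from transformers import GroundingDINOConfig, GroundingDINOModel

# Build the default configuration and a randomly initialized model from it
configuration = GroundingDINOConfig()
model = GroundingDINOModel(configuration)

# The configuration can be read back from the model
configuration = model.config
```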
+ + +## GroundingDINOConfig + +[[autodoc]] GroundingDINOConfig + +## GroundingDINOModel + +[[autodoc]] GroundingDINOModel + - forward + +## GroundingDINOForObjectDetection + +[[autodoc]] GroundingDINOForObjectDetection + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9b95aadffccc6f..aa2f7837b4ce67 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -274,6 +274,7 @@ "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"], "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], "models.deprecated": [], "models.deprecated.bort": [], @@ -1541,6 +1542,14 @@ "DeformableDetrPreTrainedModel", ] ) + _import_structure["models.grounding_dino"].extend( + [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + ) _import_structure["models.deit"].extend( [ "DEIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4329,6 +4338,7 @@ DecisionTransformerConfig, ) from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.deprecated.mctct import ( MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5435,6 +5445,12 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) from .models.deit import ( DEIT_PRETRAINED_MODEL_ARCHIVE_LIST, DeiTForImageClassification, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 8f60447e7319f9..376f9353608e56 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -59,6 +59,7 @@ deberta_v2, decision_transformer, deformable_detr, + grounding_dino, deit, deprecated, deta, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index a345235951d48c..db5e5f86761b88 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -71,6 +71,7 @@ ("deberta-v2", "DebertaV2Config"), ("decision_transformer", "DecisionTransformerConfig"), ("deformable_detr", "DeformableDetrConfig"), + ("grounding-dino", "GroundingDINOConfig"), ("deit", "DeiTConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), @@ -277,6 +278,7 @@ ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deta", "DETA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -474,6 +476,7 @@ ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), ("deformable_detr", "Deformable DETR"), + ("grounding-dino", "Grounding DINO"), ("deit", 
"DeiT"), ("deplot", "DePlot"), ("deta", "DETA"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 90ece37c657191..78a0686c4816b0 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -50,6 +50,7 @@ ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("data2vec-vision", "BeitFeatureExtractor"), ("deformable_detr", "DeformableDetrFeatureExtractor"), + ("grounding-dino", "GroundingDINOFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("dinat", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 37ccc829de1ba5..ec8bf20938fd7a 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -53,6 +53,7 @@ ("cvt", "ConvNextImageProcessor"), ("data2vec-vision", "BeitImageProcessor"), ("deformable_detr", "DeformableDetrImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("deit", "DeiTImageProcessor"), ("deta", "DetaImageProcessor"), ("detr", "DetrImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 8be38bb3f8d577..2c54349e8306b2 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -69,6 +69,7 @@ ("deberta-v2", "DebertaV2Model"), ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), + ("grounding-dino", "GroundingDINOModel"), ("deit", "DeiTModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), @@ -619,6 +620,7 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), + ("grounding-dino", "GroundingDINOForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py new file mode 100644 index 00000000000000..e3767e017d1023 --- /dev/null +++ b/src/transformers/models/grounding_dino/__init__.py @@ -0,0 +1,57 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_grounding_dino"] = [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py new file mode 100644 index 00000000000000..0b3ae3d74d3475 --- /dev/null +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -0,0 +1,262 @@ +# coding=utf-8 +# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Grounding DINO model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + +GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "idea-research/grg-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", +} + + + +class GroundingDINOConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate + a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Grounding DINO + [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. 
Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`GroundingDINOModel`] can detect in a single image. In case `two_stage` is set to `True`, we use + `two_stage_num_proposals` instead. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional + backbone from the timm package. For a list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. 
+ bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + num_feature_levels (`int`, *optional*, defaults to 4): + The number of input feature levels. + encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `False`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. + two_stage_num_proposals (`int`, *optional*, defaults to 300): + The number of region proposals to be generated, in case `two_stage` is set to `True`. + with_box_refine (`bool`, *optional*, defaults to `False`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. 
+ + Examples: + + ```python + >>> from transformers import GroundingDINOConfig, GroundingDINOModel + + >>> # Initializing a Grounding DINO SenseTime/deformable-detr style configuration + >>> configuration = GroundingDINOConfig() + + >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> model = GroundingDINOModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "grounding-dino" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + use_timm_backbone=True, + backbone_config=None, + num_channels=3, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=1024, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + return_intermediate=True, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + use_pretrained_backbone=True, + dilation=False, + num_feature_levels=4, + encoder_n_points=4, + decoder_n_points=4, + two_stage=False, + two_stage_num_proposals=300, + with_box_refine=False, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + focal_alpha=0.25, + disable_custom_kernels=False, + **kwargs, + ): + if backbone_config is not None and use_timm_backbone: + raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") + + if not use_timm_backbone: + if backbone_config is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_channels = num_channels + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.dilation = dilation + # deformable attributes + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + self.two_stage_num_proposals = two_stage_num_proposals + self.with_box_refine = with_box_refine + if two_stage is True and with_box_refine is False: + raise ValueError("If two_stage is True, with_box_refine must be True.") + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py new file mode 100644 index 00000000000000..d3cef0366b2bca --- /dev/null +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py @@ -0,0 +1,237 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert Grounding DINO checkpoints.""" + + +import argparse +import json +from pathlib import Path + +import requests +import torch +from huggingface_hub import cached_download, hf_hub_url +from PIL import Image + +from transformers import GroundingDINOConfig, GroundingDINOForObjectDetection, DeformableDetrImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def rename_key(orig_key): + if "backbone.0.body" in orig_key: + orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") + if "transformer" in orig_key: + orig_key = orig_key.replace("transformer.", "") + if "norm1" in orig_key: + if "encoder" in orig_key: + orig_key = orig_key.replace("norm1", "self_attn_layer_norm") + else: + orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") + if "norm2" in orig_key: + if "encoder" in orig_key: + orig_key = orig_key.replace("norm2", "final_layer_norm") + else: + orig_key = orig_key.replace("norm2", "self_attn_layer_norm") + if "norm3" in orig_key: + orig_key = orig_key.replace("norm3", "final_layer_norm") + if "linear1" in orig_key: + orig_key = orig_key.replace("linear1", "fc1") + if "linear2" in orig_key: + orig_key = orig_key.replace("linear2", "fc2") + if "query_embed" in orig_key: + orig_key = orig_key.replace("query_embed", "query_position_embeddings") + if "cross_attn" in orig_key: + orig_key = orig_key.replace("cross_attn", "encoder_attn") + + return orig_key + + +def read_in_q_k_v(state_dict): + # transformer decoder self-attention layers + for i in range(6): + # read in weights + bias of input projection layer of self-attention + in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_grounding_dino_checkpoint( + checkpoint_path, + single_scale, + dilation, + with_box_refine, + two_stage, + pytorch_dump_folder_path, + push_to_hub, +): + """ + Copy/paste/tweak model's weights to our Grounding DINO structure. 
+ """ + + # load default config + config = GroundingDINOConfig() + # set config attributes + if single_scale: + config.num_feature_levels = 1 + config.dilation = dilation + config.with_box_refine = with_box_refine + config.two_stage = two_stage + # set labels + config.num_labels = 91 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load image processor + image_processor = DeformableDetrImageProcessor(format="coco_detection") + + # prepare image + img = prepare_img() + encoding = image_processor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info("Converting model...") + + # load original state dict + state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + # rename keys + for key in state_dict.copy().keys(): + val = state_dict.pop(key) + state_dict[rename_key(key)] = val + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "model." + for key in state_dict.copy().keys(): + if not key.startswith("class_embed") and not key.startswith("bbox_embed"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = GroundingDINOForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + + device = "cuda" if torch.cuda.is_available() else "cpu" + model.to(device) + # verify our conversion + outputs = model(pixel_values.to(device)) + + expected_logits = torch.tensor( + [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] + ) + expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) + + if single_scale: + expected_logits = torch.tensor( + [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + ) + expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) + + if single_scale and dilation: + expected_logits = torch.tensor( + [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] + ) + expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) + + if with_box_refine: + expected_logits = torch.tensor( + [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] + ) + expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) + + if with_box_refine and two_stage: + expected_logits = torch.tensor( + [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] + ) + expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]]) + + print("Logits:", outputs.logits[0, :3, :3]) + + assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) + + print("Everything ok!") + + # Save model and image processor + logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") + 
Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + image_processor.save_pretrained(pytorch_dump_folder_path) + + # Push to hub + if push_to_hub: + model_name = "deformable-detr" + model_name += "-single-scale" if single_scale else "" + model_name += "-dc5" if dilation else "" + model_name += "-with-box-refine" if with_box_refine else "" + model_name += "-two-stage" if two_stage else "" + print("Pushing model to hub...") + model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", + type=str, + default="/home/niels/checkpoints/grounding_dino/r50_grounding_dino-checkpoint.pth", + help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", + ) + parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") + parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") + parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") + parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the folder to output PyTorch model.", + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + args = parser.parse_args() + convert_grounding_dino_checkpoint( + args.checkpoint_path, + args.single_scale, + args.dilation, + args.with_box_refine, + args.two_stage, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/src/transformers/models/grounding_dino/load_custom.py b/src/transformers/models/grounding_dino/load_custom.py new file mode 100644 index 00000000000000..97b8f09fb5f446 --- /dev/null +++ b/src/transformers/models/grounding_dino/load_custom.py @@ -0,0 +1,49 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Loading of Grounding DINO's CUDA kernels""" +import os +from pathlib import Path + + +def load_cuda_kernels(): + from torch.utils.cpp_extension import load + + root = Path(__file__).resolve().parent.parent.parent / "kernels" / "grounding_dino" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] + + load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) + + import MultiScaleDeformableAttention as MSDA + + return MSDA diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py new file mode 100644 index 00000000000000..ee80a562e4b851 --- /dev/null +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -0,0 +1,2513 @@ +# coding=utf-8 +# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Grounding DINO model.""" + + +import copy +import math +import warnings +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_torch_cuda_available, + is_vision_available, + replace_return_docstrings, + requires_backends, +) +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import is_ninja_available, logging +from ..auto import AutoBackbone +from .configuration_grounding_dino import GroundingDINOConfig +from .load_custom import load_cuda_kernels + + +logger = logging.get_logger(__name__) + +# Move this to not compile only when importing, this needs to happen later, like in __init__. 
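# A sketch of one way to do that deferral (hypothetical helper, not called by the code below):
# build the kernels on first use instead of at import time.
def _lazy_load_multi_scale_deformable_attention_kernels():
    """Hypothetical helper: compile/load the custom CUDA kernels lazily rather than at import time."""
    global MultiScaleDeformableAttention
    if MultiScaleDeformableAttention is None and is_torch_cuda_available() and is_ninja_available():
        try:
            MultiScaleDeformableAttention = load_cuda_kernels()
        except Exception as exception:
            logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {exception}")
    return MultiScaleDeformableAttention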
+if is_torch_cuda_available() and is_ninja_available(): + logger.info("Loading custom CUDA kernels...") + try: + MultiScaleDeformableAttention = load_cuda_kernels() + except Exception as e: + logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}") + MultiScaleDeformableAttention = None +else: + MultiScaleDeformableAttention = None + +if is_vision_available(): + from transformers.image_transforms import center_to_corners_format + + +class MultiScaleDeformableAttentionFunction(Function): + @staticmethod + def forward( + context, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + context.im2col_step = im2col_step + output = MultiScaleDeformableAttention.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + context.im2col_step, + ) + context.save_for_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights + ) + return output + + @staticmethod + @once_differentiable + def backward(context, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = context.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + context.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "GroundingDINOConfig" +_CHECKPOINT_FOR_DOC = "idea-research/grg-dino-tiny" + +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "idea-research/grg-dino-tiny", + # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino +] + + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->GroundingDINO +class GroundingDINODecoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDINODecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOModelOutput(ModelOutput): + """ + Base class for outputs of the Grounding DINO encoder-decoder model. + + Args: + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. 
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + """ + + init_reference_points: torch.FloatTensor = None + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->GroundingDINO +class GroundingDINOObjectDetectionOutput(ModelOutput): + """ + Output type of [`GroundingDINOForObjectDetection`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. 
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~GroundingDINOProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, + 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average + in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). 
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + init_reference_points: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional = None + enc_outputs_coord_logits: Optional = None + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDINO +class GroundingDINOFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. 
+ """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDINO +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDINOFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) + + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvEncoder with DeformableDetr->GroundingDINO +class GroundingDINOConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by GroundingDINOFrozenBatchNorm2d as defined above. 
+ + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = {} + if config.dilation: + kwargs["output_stride"] = 16 + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), + in_chans=config.num_channels, + **kwargs, + ) + else: + backbone = AutoBackbone.from_config(config.backbone_config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDINO +class GroundingDINOConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +# Copied from transformers.models.detr.modeling_detr._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): + """ + Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. + """ + batch_size, source_len = mask.size() + target_len = target_len if target_len is not None else source_len + + expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->GroundingDINO +class GroundingDINOSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
+ """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding +class GroundingDINOLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->GroundingDINO +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = GroundingDINOSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +def multi_scale_deformable_attention( + value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor +) -> Tensor: + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, 
num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Grounding DINO. + """ + + def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): + super().__init__() + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in GroundingDINOMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." 
+ ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + self._reset_parameters() + + def _reset_parameters(self): + nn.init.constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.n_heads, 1, 1, 2) + .repeat(1, self.n_levels, self.n_points, 1) + ) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(self.attention_weights.weight.data, 0.0) + nn.init.constant_(self.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(self.value_proj.weight.data) + nn.init.constant_(self.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(self.output_proj.weight.data) + nn.init.constant_(self.output_proj.bias.data, 0.0) + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif reference_points.shape[-1] == 4: + 
sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + if self.disable_custom_kernels: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + else: + try: + # custom kernel + output = MultiScaleDeformableAttentionFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + except Exception: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + output = self.output_proj(output) + + return output, attention_weights + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the Grounding DINO paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, 
key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO +class GroundingDINOEncoderLayer(nn.Module): + def __init__(self, config: GroundingDINOConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = GroundingDINOMultiscaleDeformableAttention( + config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. 
+ reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO +class GroundingDINODecoderLayer(nn.Module): + def __init__(self, config: GroundingDINOConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = GroundingDINOMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. 
+ position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead +class GroundingDINOClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->GroundingDINO +class GroundingDINOPreTrainedModel(PreTrainedModel): + config_class = 
GroundingDINOConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, GroundingDINOLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): + module._reset_parameters() + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GroundingDINODecoder): + module.gradient_checkpointing = value + + +GROUNDING_DINO_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GroundingDINOConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GROUNDING_DINO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`] + for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->GroundingDINO +class GroundingDINOEncoder(GroundingDINOPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`GroundingDINOEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: GroundingDINOConfig + """ + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDINOEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. Used in decoder. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINODecoder(GroundingDINOPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Grounding DINO: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: GroundingDINOConfig + """ + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Grounding DINO + self.bbox_embed = None + self.class_embed = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. 
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each self-attention layer. + reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): + Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. + spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of the feature maps. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): + Indexes for the start of each feature level. In range `[0, sequence_length]`. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): + Ratio of valid area in each feature level. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + for idx, decoder_layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + else: + if reference_points.shape[-1] != 2: + raise ValueError("Reference points' last dimension must be of size 2") + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + 
hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + if reference_points.shape[-1] != 2: + raise ValueError( + f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}" + ) + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return GroundingDINODecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. 
+ """, + GROUNDING_DINO_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO +class GroundingDINOModel(GroundingDINOPreTrainedModel): + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = GroundingDINOConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + if not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + + self.encoder = GroundingDINOEncoder(config) + self.decoder = GroundingDINODecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_heigth = valid_height.float() / height + valid_ratio_width = valid_width.float() / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 
0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, GroundingDINOModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = GroundingDINOModel.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = 
image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(features): + sources.append(self.input_proj[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj[level](features[-1][0]) + else: + source = self.input_proj[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if not self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + valid_ratios = valid_ratios.float() + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=source_flatten, + 
attention_mask=mask_flatten, + position_embeddings=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, prepare decoder inputs + batch_size, _, num_channels = encoder_outputs[0].shape + enc_outputs_class = None + enc_outputs_coord_logits = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes + ) + + # hack implementation for two-stage Grounding DINO + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.config.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + return GroundingDINOModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + 
cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) + + +@add_start_docstrings( + """ + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. + """, + GROUNDING_DINO_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO +class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + # Grounding DINO encoder-decoder model + self.model = GroundingDINOModel(config) + + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = GroundingDINOMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + for box_embed in self.bbox_embed: + nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + # Initialize weights and apply final processing + self.post_init() + + # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, GroundingDINOForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + ... 0 + ... ] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... 
) + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = GroundingDINOHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = GroundingDINOLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + if self.config.two_stage: + enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid() + outputs_loss["enc_outputs"] = {"logits": outputs.enc_outputs_class, "pred_boxes": enc_outputs_coord} + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] 
for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = GroundingDINOObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) + + return dict_outputs + + +# Copied from transformers.models.detr.modeling_detr.dice_loss +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDINO +class GroundingDINOLoss(nn.Module): + """ + This class computes the losses for `GroundingDINOForObjectDetection`. 
The process happens in two steps: 1) we + compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of + matched ground-truth / prediction (supervise class and box). + + Args: + matcher (`GroundingDINOHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parameter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros( + [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + dtype=source_logits.dtype, + layout=source_logits.layout, + device=source_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * source_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. 
+ """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx + + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
+ if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + if "enc_outputs" in outputs: + enc_outputs = outputs["enc_outputs"] + bin_targets = copy.deepcopy(targets) + for bt in bin_targets: + bt["class_labels"] = torch.zeros_like(bt["class_labels"]) + indices = self.matcher(enc_outputs, bin_targets) + for loss in self.losses: + l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) + l_dict = {k + "_enc": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead +class GroundingDINOMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDINO +class GroundingDINOHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
+ + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# Copied from transformers.models.detr.modeling_detr._upcast +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +# Copied from transformers.models.detr.modeling_detr.box_area +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# Copied from transformers.models.detr.modeling_detr.box_iou +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. 
+ + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) + + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] + + return iou - (area - union) / area + + +# Copied from transformers.models.detr.modeling_detr._max_by_axis +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +# Copied from transformers.models.detr.modeling_detr.NestedTensor +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + batch_size, num_channels, height, width = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) diff --git a/tests/models/grounding_dino/__init__.py b/tests/models/grounding_dino/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py new file mode 100644 index 00000000000000..3007eef6399916 --- /dev/null +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -0,0 +1,673 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
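For reference, `nested_tensor_from_tensor_list` at the end of the modeling file above is what pads a batch of differently sized images to a common shape and builds the boolean padding mask. A minimal sketch of that behaviour (toy image sizes, not part of the patch):

```python
import torch

images = [torch.rand(3, 4, 6), torch.rand(3, 5, 5)]         # two images with different H x W
max_size = [3, 5, 6]                                         # per-dimension max over the batch
batch = torch.zeros(len(images), *max_size)
mask = torch.ones(len(images), max_size[1], max_size[2], dtype=torch.bool)
for img, pad_img, m in zip(images, batch, mask):
    pad_img[:, : img.shape[1], : img.shape[2]].copy_(img)   # zero-pad to the common size
    m[: img.shape[1], : img.shape[2]] = False                # True marks padded positions
print(batch.shape, mask.shape)  # torch.Size([2, 3, 5, 6]) torch.Size([2, 5, 6])
```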
+""" Testing suite for the PyTorch Grounding DINO model. """ + + +import inspect +import math +import unittest +from typing import Dict, List, Tuple + +from transformers import GroundingDINOConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers.file_utils import cached_property +from transformers.testing_utils import ( + require_timm, + require_torch, + require_torch_gpu, + require_vision, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import GroundingDINOForObjectDetection, GroundingDINOModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class GroundingDINOModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + image_size=196, + n_targets=8, + num_labels=91, + num_feature_levels=4, + encoder_n_points=2, + decoder_n_points=6, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.image_size = image_size + self.n_targets = n_targets + self.num_labels = num_labels + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = ( + math.ceil(self.image_size / 8) ** 2 + + math.ceil(self.image_size / 16) ** 2 + + math.ceil(self.image_size / 32) ** 2 + + math.ceil(self.image_size / 64) ** 2 + ) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + resnet_config = ResNetConfig( + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + hidden_act="relu", + num_labels=3, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + return 
GroundingDINOConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + num_feature_levels=self.num_feature_levels, + encoder_n_points=self.encoder_n_points, + decoder_n_points=self.decoder_n_points, + use_timm_backbone=False, + backbone_config=resnet_config, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, labels): + model = GroundingDINOModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) + + def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = GroundingDINOForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GroundingDINOModel, GroundingDINOForObjectDetection) if is_torch_available() else () + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "GroundingDINOForObjectDetection": + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.image_size, + self.model_tester.image_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = GroundingDINOModelTester(self) + self.config_tester = ConfigTester(self, config_class=GroundingDINOConfig, has_text_modality=False) + + def 
test_config(self): + # we don't test common_properties and arguments_init as these don't apply for Grounding DINO + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + + def test_grounding_dino_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_grounding_dino_model(*config_and_inputs) + + def test_grounding_dino_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_grounding_dino_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="Grounding DINO does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Grounding DINO does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="Grounding DINO is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="Grounding DINO does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + out_len = len(outputs) + + correct_outlen = 8 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "GroundingDINOForObjectDetection": + correct_outlen += 2 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + 
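+            # Note: with deformable attention, the cross-attention weights returned by the decoder have
+            # shape (batch_size, num_queries, num_heads, num_feature_levels, num_points), which is why
+            # the checks below compare the last three dims against
+            # [num_attention_heads, num_feature_levels, decoder_n_points] instead of the usual
+            # [num_heads, target_len, source_len].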
self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.decoder_n_points, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." 
+ ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + # we take the second output since last_hidden_state is the second item + output = outputs[1] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if 
model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "GroundingDINOForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if param.requires_grad: + if ( + "level_embed" in name + or "sampling_offsets.bias" in name + or "value_proj" in name + or "output_proj" in name + or "reference_points" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_two_stage_training(self): + model_class = GroundingDINOForObjectDetection + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + config.two_stage = True + config.auxiliary_loss = True + config.with_box_refine = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class GroundingDINOModelIntegrationTests(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") if is_vision_available() else None + + def test_inference_object_detection_head(self): + model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-9.6645, -4.3449, -5.8705], [-9.7035, 
-3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] + ).to(torch_device) + expected_boxes = torch.tensor( + [[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) + + # verify postprocessing + results = image_processor.post_process_object_detection( + outputs, threshold=0.3, target_sizes=[image.size[::-1]] + )[0] + expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]).to(torch_device) + expected_labels = [17, 17, 75, 75, 63] + expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841]).to(torch_device) + + self.assertEqual(len(results["scores"]), 5) + self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4)) + self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) + + def test_inference_object_detection_head_with_box_refine_two_stage(self): + model = GroundingDINOForObjectDetection.from_pretrained( + "SenseTime/deformable-detr-with-box-refine-two-stage" + ).to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] + ).to(torch_device) + expected_boxes = torch.tensor( + [[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) + + @require_torch_gpu + def test_inference_object_detection_head_equivalence_cpu_gpu(self): + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt") + pixel_values = encoding["pixel_values"] + pixel_mask = encoding["pixel_mask"] + + # 1. run model on CPU + model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr-single-scale") + + with torch.no_grad(): + cpu_outputs = model(pixel_values, pixel_mask) + + # 2. run model on GPU + model.to("cuda") + + with torch.no_grad(): + gpu_outputs = model(pixel_values.to("cuda"), pixel_mask.to("cuda")) + + # 3. 
assert equivalence + for key in cpu_outputs.keys(): + assert torch.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-4) + + expected_logits = torch.tensor( + [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + ) + assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-4) From f6d87c104d941fb2f1b09960407623dd6fc45d04 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 23 Aug 2023 12:25:43 -0300 Subject: [PATCH 002/252] First commit --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.md | 2 + docs/source/en/tasks/object_detection.md | 2 +- .../configuration_grounding_dino.py | 6 +- .../convert_grounding_dino_to_hf.py | 242 ++++++++++++++++++ .../convert_grounding_dino_to_pytorch.py | 237 ----------------- .../grounding_dino/modeling_grounding_dino.py | 4 +- src/transformers/utils/dummy_pt_objects.py | 24 ++ 14 files changed, 281 insertions(+), 243 deletions(-) create mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py delete mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py diff --git a/README.md b/README.md index 41fb758abe1500..d952ba96ddfa62 100644 --- a/README.md +++ b/README.md @@ -371,6 +371,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. diff --git a/README_es.md b/README_es.md index 6a0701b09d3432..2f38327dcb84ca 100644 --- a/README_es.md +++ b/README_es.md @@ -348,6 +348,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. diff --git a/README_hd.md b/README_hd.md index 8651678669a7e7..01dcffadaef0db 100644 --- a/README_hd.md +++ b/README_hd.md @@ -320,6 +320,7 @@ conda install -c huggingface transformers 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv .org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https ://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https:// arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। diff --git a/README_ja.md b/README_ja.md index 8e497e94175a0a..c88e62e459d215 100644 --- a/README_ja.md +++ b/README_ja.md @@ -382,6 +382,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 1. 
**[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました. 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) diff --git a/README_ko.md b/README_ko.md index 3f33e4b199d367..885494ef0a1abd 100644 --- a/README_ko.md +++ b/README_ko.md @@ -297,6 +297,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. 
**[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 135f9b35a9631f..c9d78e7d9887a3 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -321,6 +321,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. 
Mahoney, Kurt Keutzer 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 781c412ca2a1db..4c8ef2c098aa03 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -333,6 +333,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index e1c346971f386e..b6738dcf3a9b04 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -137,6 +137,7 @@ The documentation is organized into five sections: 1. 
**[GPTBigCode](model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. @@ -359,6 +360,7 @@ Flax), PyTorch, and/or TensorFlow. 
| GPTBigCode | ✅ | ❌ | ❌ | | GPTSAN-japanese | ✅ | ❌ | ❌ | | Graphormer | ✅ | ❌ | ❌ | +| Grounding DINO | ✅ | ❌ | ❌ | | GroupViT | ✅ | ✅ | ❌ | | Hubert | ✅ | ✅ | ❌ | | I-BERT | ✅ | ❌ | ❌ | diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 563beb274253d5..4eab9e58fb27da 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [Grounding DINO](../model_doc/grounding-dino), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0b3ae3d74d3475..23cd86fd3f9d44 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -22,7 +22,7 @@ logger = logging.get_logger(__name__) GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "idea-research/grg-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", + "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } @@ -151,8 +151,8 @@ class GroundingDINOConfig(PretrainedConfig): def __init__( self, - use_timm_backbone=True, - backbone_config=None, + use_timm_backbone=False, + backbone_config={"model_type": "swin"}, num_channels=3, num_queries=300, max_position_embeddings=1024, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py new file mode 100644 index 00000000000000..b5de1d8a652c0e --- /dev/null +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -0,0 +1,242 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert GroundingDINO SimMIM checkpoints from the original repository. 
+ +URL: https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" + +import argparse + +import requests +import torch +from PIL import Image +from torchvision import transforms as T +import torchvision.transforms.functional as F + +from transformers import ( + GroundingDINOConfig, GroundingDINOForObjectDetection +) + +IMAGENET_MEAN = [0.485, 0.456, 0.406] +IMAGENET_STD = [0.229, 0.224, 0.225] + + +def get_grounding_dino_config(model_name): + config = GroundingDINOConfig() + + if "tiny" in model_name: + window_size = 7 + embed_dim = 96 + depths = (2, 2, 6, 2) + num_heads = (3, 6, 12, 24) + image_size = 224 + elif "base" in model_name: + window_size = 12 + embed_dim = 128 + depths = (2, 2, 18, 2) + num_heads = (4, 8, 16, 32) + image_size = 384 + else: + raise ValueError("Model not supported, only supports base and large variants") + + config.backbone_config.window_size = window_size + config.backbone_config.image_size = image_size + config.backbone_config.embed_dim = embed_dim + config.backbone_config.depths = depths + config.backbone_config.num_heads = num_heads + config.backbone_config.out_indices = [2, 3, 4] + + return config + + +def create_rename_keys(config): + rename_keys = [] + # fmt: off + #TODO names might change after modifing GroundingDINOModel class + ########################################## VISION BACKBONE - START + # patch embedding layer + rename_keys.append(("module.backbone.0.patch_embed.proj.weight", + "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("module.backbone.0.patch_embed.proj.bias", + "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) + rename_keys.append(("module.backbone.0.patch_embed.norm.weight", + "model.backbone.conv_encoder.model.embeddings.norm.weight")) + rename_keys.append(("module.backbone.0.patch_embed.norm.bias", + "model.backbone.conv_encoder.model.embeddings.norm.bias")) + + for layer, depth in enumerate(config.backbone_config.depths): + for block in range(depth): + # layernorms + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) + + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) + # attention + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) + # rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) + # intermidiate + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) + + # output + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) + + # downsample + if layer!=len(config.backbone_config.depths)-1: + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.reduction.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) + + for out_indice in config.backbone_config.out_indices: + # Grounding DINO implementation of out_indices isn't aligned with transformers + rename_keys.append((f"module.backbone.0.norm{out_indice-1}.weight", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) + rename_keys.append((f"module.backbone.0.norm{out_indice-1}.bias", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) + + ########################################## VISION BACKBONE - END + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + ########################################## VISION BACKBONE - START + embed_dim = config.backbone_config.embed_dim + for layer, depth in enumerate(config.backbone_config.depths): + hidden_size = embed_dim * 2**layer + for block in range(depth): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] + + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] + + 
state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight"] = in_proj_weight[-hidden_size :, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias"] = in_proj_bias[-hidden_size :] + ########################################## VISION BACKBONE - END + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + +@torch.no_grad() +def convert_grounding_dino_checkpoint(model_name, checkpoint_path): + #Define default GroundingDINO configuation + config = get_grounding_dino_config(model_name) + + # Load original checkpoint + original_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + + # Rename keys + new_state_dict = original_state_dict.copy() + rename_keys = create_rename_keys(config) + for src, dest in rename_keys: + rename_key(new_state_dict, src, dest) + read_in_q_k_v(new_state_dict, config) + + # Load HF implementation with default config and converted state dict + model = GroundingDINOForObjectDetection(config).eval() + model.load_state_dict(new_state_dict, strict=False) + + # Load and process test image + image = prepare_img() + image_processor = T.Compose( + [ + T.Resize(size=800, max_size=1333), + T.ToTensor(), + T.Normalize(IMAGENET_MEAN, IMAGENET_STD) + ] + ) + inputs = image_processor(image) + pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) + output= model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + for feature_map in output.feature_maps: + print(f"{feature_map.shape}") + print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") + + # outputs = model(**inputs).logits + + # print(outputs.keys()) + # print("Looks ok!") + + # if pytorch_dump_folder_path is not None: + # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + # model.save_pretrained(pytorch_dump_folder_path) + + # print(f"Saving image processor to {pytorch_dump_folder_path}") + # image_processor.save_pretrained(pytorch_dump_folder_path) + + # if push_to_hub: + # print(f"Pushing model and image processor for {model_name} to hub") + # model.push_to_hub(f"microsoft/{model_name}") + # image_processor.push_to_hub(f"microsoft/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="grounding-dino-tiny", + type=str, + choices=["grounding-dino-tiny", "grounding-dino-base"], + help="Name of the GroundingDINO model you'd like to convert.", + ) + parser.add_argument( + "--checkpoint_path", + default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny.pth", + type=str, + help="Path to the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_grounding_dino_checkpoint(args.model_name, args.checkpoint_path) \ No newline at end of file diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py deleted file mode 100644 index d3cef0366b2bca..00000000000000 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py +++ /dev/null @@ -1,237 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Grounding DINO checkpoints.""" - - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import cached_download, hf_hub_url -from PIL import Image - -from transformers import GroundingDINOConfig, GroundingDINOForObjectDetection, DeformableDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_key(orig_key): - if "backbone.0.body" in orig_key: - orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") - if "transformer" in orig_key: - orig_key = orig_key.replace("transformer.", "") - if "norm1" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm1", "self_attn_layer_norm") - else: - orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") - if "norm2" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm2", "final_layer_norm") - else: - orig_key = orig_key.replace("norm2", "self_attn_layer_norm") - if "norm3" in orig_key: - orig_key = orig_key.replace("norm3", "final_layer_norm") - if "linear1" in orig_key: - orig_key = orig_key.replace("linear1", "fc1") - if "linear2" in orig_key: - orig_key = orig_key.replace("linear2", "fc2") - if "query_embed" in orig_key: - orig_key = orig_key.replace("query_embed", "query_position_embeddings") - if "cross_attn" in orig_key: - orig_key = orig_key.replace("cross_attn", "encoder_attn") - - return orig_key - - -def read_in_q_k_v(state_dict): - # transformer decoder self-attention layers - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of 
cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_grounding_dino_checkpoint( - checkpoint_path, - single_scale, - dilation, - with_box_refine, - two_stage, - pytorch_dump_folder_path, - push_to_hub, -): - """ - Copy/paste/tweak model's weights to our Grounding DINO structure. - """ - - # load default config - config = GroundingDINOConfig() - # set config attributes - if single_scale: - config.num_feature_levels = 1 - config.dilation = dilation - config.with_box_refine = with_box_refine - config.two_stage = two_stage - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - image_processor = DeformableDetrImageProcessor(format="coco_detection") - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "model." - for key in state_dict.copy().keys(): - if not key.startswith("class_embed") and not key.startswith("bbox_embed"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = GroundingDINOForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - # verify our conversion - outputs = model(pixel_values.to(device)) - - expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ) - expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) - - if single_scale: - expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] - ) - expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) - - if single_scale and dilation: - expected_logits = torch.tensor( - [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] - ) - expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) - - if with_box_refine: - expected_logits = torch.tensor( - [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] - ) - expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) - - if with_box_refine and two_stage: - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ) - expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], 
[0.5490, 0.2763, 0.0564]]) - - print("Logits:", outputs.logits[0, :3, :3]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - - print("Everything ok!") - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - model_name = "deformable-detr" - model_name += "-single-scale" if single_scale else "" - model_name += "-dc5" if dilation else "" - model_name += "-with-box-refine" if with_box_refine else "" - model_name += "-two-stage" if two_stage else "" - print("Pushing model to hub...") - model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - type=str, - default="/home/niels/checkpoints/grounding_dino/r50_grounding_dino-checkpoint.pth", - help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", - ) - parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") - parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") - parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") - parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
- ) - args = parser.parse_args() - convert_grounding_dino_checkpoint( - args.checkpoint_path, - args.single_scale, - args.dilation, - args.with_box_refine, - args.two_stage, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ee80a562e4b851..603bdfdd8e8126 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -122,10 +122,10 @@ def backward(context, grad_output): logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "GroundingDINOConfig" -_CHECKPOINT_FOR_DOC = "idea-research/grg-dino-tiny" +_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "idea-research/grg-dino-tiny", + "idea-research/grounding-dino-tiny", # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino ] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 5724e689f2fce2..f0bc1e774383b5 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2340,6 +2340,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GroundingDINOForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = None From f2052b0f44d2157de631adfdd0ccfb53ba7ff7bf Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 27 Aug 2023 01:47:21 -0300 Subject: [PATCH 003/252] Added bert to model --- .../configuration_grounding_dino.py | 7 +- .../convert_grounding_dino_to_hf.py | 13 +- .../grounding_dino/modeling_grounding_dino.py | 686 +++++++++++++++++- 3 files changed, 692 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 23cd86fd3f9d44..9025d01e725561 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -16,7 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -44,6 +44,8 @@ class GroundingDINOConfig(PretrainedConfig): backbone_config (`PretrainedConfig` or `dict`, *optional*): The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which case it will default to `ResNetConfig()`. + text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): + The configuration of the text backbone model. Should be a bert-like config. num_channels (`int`, *optional*, defaults to 3): The number of input channels. 
num_queries (`int`, *optional*, defaults to 300): @@ -153,6 +155,7 @@ def __init__( self, use_timm_backbone=False, backbone_config={"model_type": "swin"}, + text_backbone_config="bert-base-uncased", num_channels=3, num_queries=300, max_position_embeddings=1024, @@ -251,6 +254,8 @@ def __init__( self.eos_coefficient = eos_coefficient self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels + # Text backbone + self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index b5de1d8a652c0e..d5b07b32c3f49f 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -60,7 +60,7 @@ def get_grounding_dino_config(model_name): return config -def create_rename_keys(config): +def create_rename_keys(state_dict, config): rename_keys = [] # fmt: off #TODO names might change after modifing GroundingDINOModel class @@ -126,10 +126,14 @@ def create_rename_keys(config): ########################################## VISION BACKBONE - END + ########################################## TEXT BACKBONE - START + for layer_name, params in state_dict.items(): + if "module.bert" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) + ########################################## TEXT BACKBONE - END # fmt: on return rename_keys - def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val @@ -172,7 +176,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Rename keys new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(config) + rename_keys = create_rename_keys(original_state_dict, config) for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) @@ -192,7 +196,8 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) inputs = image_processor(image) pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) - output= model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + output = model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + for feature_map in output.feature_maps: print(f"{feature_map.shape}") print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 603bdfdd8e8126..8bea6eee50096e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -19,7 +19,7 @@ import math import warnings from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -39,8 +39,13 @@ replace_return_docstrings, requires_backends, ) -from ...modeling_outputs import BaseModelOutput +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPoolingAndCrossAttentions, + BaseModelOutputWithPastAndCrossAttentions +) from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, 
find_pruneable_heads_and_indices, prune_linear_layer from ...pytorch_utils import meshgrid from ...utils import is_ninja_available, logging from ..auto import AutoBackbone @@ -173,7 +178,7 @@ class GroundingDINODecoderOutput(ModelOutput): # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOModelOutput(ModelOutput): """ - Base class for outputs of the Grounding DINO encoder-decoder model. + Base class for outputs of the Deformable DETR encoder-decoder model. Args: init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): @@ -432,6 +437,7 @@ def __init__(self, config): if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: parameter.requires_grad_(False) + # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDINO def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps @@ -600,7 +606,7 @@ def multi_scale_deformable_attention( # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOMultiscaleDeformableAttention(nn.Module): """ - Multiscale deformable attention as proposed in Grounding DINO. + Multiscale deformable attention as proposed in Deformable DETR. """ def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): @@ -736,7 +742,7 @@ class GroundingDINOMultiheadAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the Grounding DINO paper). + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). """ def __init__( @@ -1294,7 +1300,7 @@ class GroundingDINODecoder(GroundingDINOPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - Some tweaks for Grounding DINO: + Some tweaks for Deformable DETR: - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. - it also returns a stack of intermediate outputs and reference points from all decoding layers. 
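The hunks above only adjust docstrings and `# Copied from` markers of the multiscale deformable attention and decoder classes inherited from Deformable DETR. For reference, the core of that attention mechanism is a weighted bilinear sampling of a small set of points from each feature level. A minimal sketch, assuming per-level value tensors of shape `(batch, num_heads, head_dim, H_l, W_l)` and sampling locations normalized to `[0, 1]`; this is an illustration only, not the `multi_scale_deformable_attention` helper used in this file:

import torch
import torch.nn.functional as F


def deformable_sampling(values, sampling_locations, attention_weights):
    # values: list over feature levels of (batch, num_heads, head_dim, H_l, W_l) tensors
    # sampling_locations: (batch, num_queries, num_heads, num_levels, num_points, 2), normalized to [0, 1]
    # attention_weights: (batch, num_queries, num_heads, num_levels, num_points), softmaxed over levels * points
    batch, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
    sampled = []
    for level, value in enumerate(values):
        # grid_sample expects (x, y) coordinates in [-1, 1]
        grid = 2 * sampling_locations[:, :, :, level] - 1              # (batch, num_queries, num_heads, num_points, 2)
        grid = grid.permute(0, 2, 1, 3, 4).flatten(0, 1)               # (batch * num_heads, num_queries, num_points, 2)
        value = value.flatten(0, 1)                                    # (batch * num_heads, head_dim, H_l, W_l)
        sampled.append(F.grid_sample(value, grid, mode="bilinear", padding_mode="zeros", align_corners=False))
    sampled = torch.stack(sampled, dim=-2)                             # (batch * num_heads, head_dim, num_queries, num_levels, num_points)
    weights = attention_weights.permute(0, 2, 1, 3, 4).flatten(0, 1)   # (batch * num_heads, num_queries, num_levels, num_points)
    output = (sampled * weights.unsqueeze(1)).sum(dim=(-2, -1))        # (batch * num_heads, head_dim, num_queries)
    return output.view(batch, num_heads, -1, num_queries).permute(0, 3, 1, 2).flatten(2)  # (batch, num_queries, num_heads * head_dim)

Because the offsets and weights are learned per query, the cost scales with the small number of sampled points rather than with the full H x W of every feature level.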
@@ -1310,7 +1316,7 @@ def __init__(self, config: GroundingDINOConfig): self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.gradient_checkpointing = False - # hack implementation for iterative bounding box refinement and two-stage Grounding DINO + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR self.bbox_embed = None self.class_embed = None @@ -1493,6 +1499,8 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + # Create Text Extractor + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -1772,7 +1780,7 @@ def forward( encoder_outputs[0], ~mask_flatten, spatial_shapes ) - # hack implementation for two-stage Grounding DINO + # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) @@ -1850,7 +1858,7 @@ class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): def __init__(self, config: GroundingDINOConfig): super().__init__(config) - # Grounding DINO encoder-decoder model + # Deformable DETR encoder-decoder model self.model = GroundingDINOModel(config) # Detection heads on top @@ -2178,6 +2186,7 @@ def loss_labels(self, outputs, targets, indices, num_boxes): return losses @torch.no_grad() + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality def loss_cardinality(self, outputs, targets, indices, num_boxes): """ Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. @@ -2193,6 +2202,7 @@ def loss_cardinality(self, outputs, targets, indices, num_boxes): losses = {"cardinality_error": card_err} return losses + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes def loss_boxes(self, outputs, targets, indices, num_boxes): """ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. 
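The `loss_cardinality` / `loss_boxes` hunks above only add `# Copied from` markers; the underlying box losses are the standard DETR pair of an L1 term on normalized `(cx, cy, w, h)` boxes and a generalized-IoU term. A minimal sketch, assuming the matched prediction/target pairs have already been gathered by the Hungarian matcher, and using torchvision's box utilities instead of the internal `center_to_corners_format` / `generalized_box_iou` helpers the modeling file relies on:

import torch
import torch.nn.functional as F
from torchvision.ops import box_convert, generalized_box_iou


def box_losses(src_boxes, target_boxes, num_boxes):
    # src_boxes, target_boxes: (num_matched, 4) in normalized (cx, cy, w, h) format
    loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
    giou = generalized_box_iou(
        box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
        box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
    )
    loss_giou = 1 - torch.diag(giou)  # only the matched (i, i) pairs contribute
    return {"loss_bbox": loss_bbox.sum() / num_boxes, "loss_giou": loss_giou.sum() / num_boxes}

Both terms are averaged over the total number of matched boxes so that images with many objects do not dominate the batch loss.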
@@ -2217,12 +2227,14 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): losses["loss_giou"] = loss_giou.sum() / num_boxes return losses + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx def _get_source_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) source_idx = torch.cat([source for (source, _) in indices]) return batch_idx, source_idx + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx def _get_target_permutation_idx(self, indices): # permute targets following indices batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) @@ -2511,3 +2523,659 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): else: raise ValueError("Only 3-dimensional tensors are supported") return NestedTensor(tensor, mask) + +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText +class GroundingDINOTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = 
self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText +class GroundingDINOTextSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDINOTextModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText +class GroundingDINOTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText +class GroundingDINOTextAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = GroundingDINOTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = GroundingDINOTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText +class 
GroundingDINOTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText +class GroundingDINOTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText +class GroundingDINOTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = GroundingDINOTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = GroundingDINOTextAttention(config, position_embedding_type="absolute") + self.intermediate = GroundingDINOTextIntermediate(config) + self.output = GroundingDINOTextOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + 
cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText +class GroundingDINOTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([GroundingDINOTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText +class GroundingDINOTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + +# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText +class GroundingDINOTextModel(nn.Module): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__() + self.config = config + + self.embeddings = GroundingDINOTextEmbeddings(config) + self.encoder = GroundingDINOTextEncoder(config) + + self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) From 632f8a6fb043ddbccf1bfbb9582c9f0b9f583b38 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 29 Aug 2023 23:30:53 -0300 Subject: [PATCH 004/252] Bert validated --- .../configuration_grounding_dino.py | 7 +- .../convert_grounding_dino_to_hf.py | 105 ++++++++++++++++-- .../grounding_dino/modeling_grounding_dino.py | 5 +- 3 files changed, 106 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 9025d01e725561..0b4df30f6ee46f 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -177,7 +177,7 @@ def __init__( return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", - backbone="resnet50", + backbone="swin", use_pretrained_backbone=True, dilation=False, num_feature_levels=4, @@ -196,6 +196,9 @@ def __init__( eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, + #other parameters + max_text_len = 256, + sub_sentence_present = True, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -256,6 +259,8 @@ def __init__( self.disable_custom_kernels = disable_custom_kernels # Text 
backbone self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) + self.max_text_len = max_text_len + self.sub_sentence_present = sub_sentence_present super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d5b07b32c3f49f..d5ebc9281b8733 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -25,7 +25,7 @@ import torchvision.transforms.functional as F from transformers import ( - GroundingDINOConfig, GroundingDINOForObjectDetection + GroundingDINOConfig, GroundingDINOForObjectDetection, AutoTokenizer ) IMAGENET_MEAN = [0.485, 0.456, 0.406] @@ -166,6 +166,88 @@ def prepare_img(): image = Image.open(requests.get(url, stream=True).raw).convert("RGB") return image +def text_processor(text: str, config): + def preprocess_caption(caption: str) -> str: + result = caption.lower().strip() + if result.endswith("."): + return result + return result + "." + def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + ) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + cate_to_token_mask_list = [ + torch.stack(cate_to_token_mask_listi, dim=0) + for cate_to_token_mask_listi in cate_to_token_mask_list + ] + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.to(torch.long) + tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) + special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + text = preprocess_caption(text) + tokenized = tokenizer([text], padding="longest", return_tensors="pt") + text_self_attention_masks, position_ids = 
generate_masks_with_special_tokens_and_transfer_map( + tokenized, special_tokens) + + max_text_len = config.max_text_len + sub_sentence_present = config.sub_sentence_present + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[ + :, : max_text_len, : max_text_len + ] + position_ids = position_ids[:, : max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, : max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, : max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : max_text_len] + + # extract text embeddings + if sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + + return tokenized_for_encoder + @torch.no_grad() def convert_grounding_dino_checkpoint(model_name, checkpoint_path): #Define default GroundingDINO configuation @@ -187,6 +269,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Load and process test image image = prepare_img() + text = "a cat" image_processor = T.Compose( [ T.Resize(size=800, max_size=1333), @@ -194,13 +277,21 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): T.Normalize(IMAGENET_MEAN, IMAGENET_STD) ] ) - inputs = image_processor(image) - pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) - output = model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + image_inputs = image_processor(image) + text_inputs = text_processor(text, config) + + pixel_mask = torch.ones( + ((1, image_inputs.shape[1], image_inputs.shape[2])), + dtype=torch.long, + device=image_inputs.device + ) + # output = model.model.backbone.conv_encoder.model(pixel_values=image_inputs.unsqueeze(0)) + output = model.model.text_backbone(**text_inputs) + print(output.last_hidden_state[:, :, :5]) - for feature_map in output.feature_maps: - print(f"{feature_map.shape}") - print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") + # for feature_map in output.last_hidden_state: + # print(f"{feature_map.shape}") + # print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") # outputs = model(**inputs).logits diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8bea6eee50096e..ebe151de480211 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -3014,7 +3014,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output # Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText -class GroundingDINOTextModel(nn.Module): +class GroundingDINOTextModel(PreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -3028,8 +3028,7 @@ class GroundingDINOTextModel(nn.Module): """ def __init__(self, config, add_pooling_layer=True): - super().__init__() - self.config = config + super().__init__(config) self.embeddings = GroundingDINOTextEmbeddings(config) self.encoder = GroundingDINOTextEncoder(config) From e04de0ec32d94e9cfe01b7f631a4f74e54287a5d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 31 Aug 2023 20:03:28 -0300 Subject: [PATCH 005/252] Created Text and Fusion layers for 
Encoder --- .../configuration_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 309 +++++++++++++++++- 2 files changed, 306 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0b4df30f6ee46f..e77d4be247b746 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -160,7 +160,7 @@ def __init__( num_queries=300, max_position_embeddings=1024, encoder_layers=6, - encoder_ffn_dim=1024, + encoder_ffn_dim=2048, encoder_attention_heads=8, decoder_layers=6, decoder_ffn_dim=1024, diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ebe151de480211..731172570c23d2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -854,6 +854,304 @@ def forward( return attn_output, attn_weights_reshaped +# Repeting some code to avoid convert nn.MultiheadAttention later +class GroundingDINOEncoderTextLayer(nn.Module): + def __init__( + self, + embed_dim, + num_heads, + ffn_dim: int, + dropout: float = 0.0, + bias: bool = True, + activation: str = 'relu' + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) + # Implementation of Feedforward model + self.fc1 = nn.Linear(embed_dim, ffn_dim) + self.dropout = nn.Dropout(dropout) + self.fc2 = nn.Linear(ffn_dim, embed_dim) + + self.layer_norm_before = nn.LayerNorm(embed_dim) + self.layer_norm_after = nn.LayerNorm(embed_dim) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = ACT2FN[activation] + self.num_heads = num_heads + + def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): + return hidden_state if position_embeddings is None else hidden_state + position_embeddings + + def forward( + self, + hidden_states: Tensor, + attention_masks: Optional[Tensor] = None, + position_embeddings: Optional[Tensor] = None, + ): # repeat attn mask + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[1]: + # bs, num_q, num_k + attention_masks = attention_masks.repeat(self.num_heads, 1, 1) + + q = k = self.with_pos_embed(hidden_states, position_embeddings) + attention_output = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks)[0] + + hidden_states = hidden_states + self.dropout1(attention_output) + hidden_states = self.layer_norm_before(hidden_states) + hidden_states = self.activation(self.fc1(hidden_states)) + attention_output = self.fc2(self.dropout(hidden_states)) + hidden_states = hidden_states + self.dropout2(attention_output) + hidden_states = self.layer_norm_after(hidden_states) + return hidden_states + +class BiMultiHeadAttention(nn.Module): + def __init__( + self, + vision_dim: int, + text_dim: int, + embed_dim: int, + num_heads: int, + dropout:float = 0.1 + ): + super().__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.vision_dim = vision_dim + self.text_dim = text_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
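Setting the projection weights aside, the core idea of this bi-directional attention can be sketched as a single image-text similarity matrix that is normalised along different axes; the sizes below are illustrative, and the multi-head split, value projections and clamping from the class are deliberately left out:

    import torch

    vision_features = torch.randn(2, 100, 32)   # (batch, num_image_tokens, dim) -- sizes are arbitrary
    text_features = torch.randn(2, 12, 32)      # (batch, num_text_tokens, dim)

    similarity = torch.bmm(vision_features, text_features.transpose(1, 2)) / 32 ** 0.5  # (batch, n_img, n_text)

    vision_to_text = torch.softmax(similarity, dim=-1)                  # each image token attends over the text tokens
    text_to_vision = torch.softmax(similarity.transpose(1, 2), dim=-1)  # each text token attends over the image tokens

    vision_update = torch.bmm(vision_to_text, text_features)            # (batch, n_img, dim)
    text_update = torch.bmm(text_to_vision, vision_features)            # (batch, n_text, dim)

Computing the similarity once and reusing its transpose is what makes the fusion symmetric: image features are refined by the text they match, and text features are refined by the regions that match them.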
+ self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.text_proj = nn.Linear(self.text_dim, self.embed_dim) + self.values_vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.values_text_proj = nn.Linear(self.text_dim, self.embed_dim) + + self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) + self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.vision_proj.weight) + self.vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.text_proj.weight) + self.text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_vision_proj.weight) + self.values_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_text_proj.weight) + self.values_text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_vision_proj.weight) + self.out_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_text_proj.weight) + self.out_text_proj.bias.data.fill_(0) + + def forward( + self, + vision_features: Tensor, + text_features: Tensor, + vision_attention_mask: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None + ): + """_summary_ + + Args: + vision_features Tensor: bs, n_img, dim + text_features Tensor: bs, n_text, dim + vision_attention_mask (Tensor, optional): _description_. bs, n_img + text_attention_mask (Tensor, optional): _description_. bs, n_text + + Returns: + _type_: _description_ + """ + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + bsz, tgt_len, _ = vision_features.size() + + vision_query_states = self.vision_proj(vision_features) * self.scale + vision_query_states = self._shape(vision_query_states, tgt_len, bsz) + + text_key_states = self.text_proj(text_features) + text_key_states = self._shape(text_key_states, -1, bsz) + + vision_value_states = self.values_vision_proj(vision_features) + vision_value_states = self._shape(vision_value_states, -1, bsz) + + text_value_states = self.values_text_proj(text_features) + text_value_states = self._shape(text_value_states, -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + + vision_query_states = vision_query_states.view(*proj_shape) + text_key_states = text_key_states.view(*proj_shape) + vision_value_states = vision_value_states.view(*proj_shape) + text_value_states = text_value_states.view(*proj_shape) + + src_len = text_key_states.size(1) + attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + attn_weights = attn_weights - attn_weights.max() + + attn_weights = torch.clamp( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + attn_weights = torch.clamp( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose(1, 2) + text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, 
keepdim=True)[0] + + text_attn_weights = torch.clamp( + text_attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + text_attn_weights = torch.clamp( + text_attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if vision_attention_mask is not None: + vision_attention_mask = ( + vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + text_attn_weights.masked_fill_(vision_attention_mask, float("-inf")) + + text_attn_weights = text_attn_weights.softmax(dim=-1) + + # mask language for vision + if text_attention_mask is not None: + text_attention_mask = ( + text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights.masked_fill_(text_attention_mask, float("-inf")) + vision_attn_weights = attn_weights.softmax(dim=-1) + + vision_attn_probs = F.dropout(vision_attn_weights, p=self.dropout, training=self.training) + text_attn_probs = F.dropout(text_attn_weights, p=self.dropout, training=self.training) + + vision_attn_output = torch.bmm(vision_attn_probs, text_value_states) + text_attn_output = torch.bmm(text_attn_probs, vision_value_states) + + if vision_attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`vision_attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" + ) + + if text_attn_output.size() != (bsz * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`text_attn_output` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" + ) + + vision_attn_output = vision_attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + vision_attn_output = vision_attn_output.transpose(1, 2) + vision_attn_output = vision_attn_output.reshape(bsz, tgt_len, self.embed_dim) + + text_attn_output = text_attn_output.view(bsz, self.num_heads, src_len, self.head_dim) + text_attn_output = text_attn_output.transpose(1, 2) + text_attn_output = text_attn_output.reshape(bsz, src_len, self.embed_dim) + + vision_attn_output = self.out_vision_proj(vision_attn_output) + text_attn_output = self.out_text_proj(text_attn_output) + + return vision_attn_output, text_attn_output + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO +class GroundingDINODropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + +class GroundingDINOBiAttention(nn.Module): + def __init__( + self, + vision_dim, + text_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super().__init__() + + # pre layer norm + self.layer_norm_vision = nn.LayerNorm(vision_dim) + self.layer_norm_text = nn.LayerNorm(text_dim) + self.attn = BiMultiHeadAttention( + vision_dim=vision_dim, text_dim=text_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.gamma_v = nn.Parameter(init_values * torch.ones((vision_dim)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((text_dim)), requires_grad=True) + + def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): + vision_features = self.layer_norm_vision(vision_features) + text_features = self.layer_norm_text(text_features) + delta_v, delta_l = self.attn( + vision_features, + text_features, + attention_mask_vision=attention_mask_vision, + attention_mask_text=attention_mask_text + ) + # vision_features, text_features = vision_features + delta_v, text_features + delta_l + vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) + text_features = text_features + self.drop_path(self.gamma_l * delta_l) + return vision_features, text_features # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO class GroundingDINOEncoderLayer(nn.Module): @@ -1499,8 +1797,6 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) - # Create Text Extractor - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -1850,7 +2146,6 @@ def forward( """, GROUNDING_DINO_START_DOCSTRING, ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with 
DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] @@ -1866,6 +2161,7 @@ def __init__(self, config: GroundingDINOConfig): self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) @@ -2588,6 +2884,8 @@ def forward( embeddings = self.dropout(embeddings) return embeddings +# Classes for Text Backbone (It's just a BERT model) + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3013,7 +3311,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: pooled_output = self.activation(pooled_output) return pooled_output -# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText class GroundingDINOTextModel(PreTrainedModel): """ @@ -3029,12 +3326,16 @@ class GroundingDINOTextModel(PreTrainedModel): def __init__(self, config, add_pooling_layer=True): super().__init__(config) + self.config = config self.embeddings = GroundingDINOTextEmbeddings(config) self.encoder = GroundingDINOTextEncoder(config) self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + # Initialize weights and apply final processing + self.post_init() + def get_input_embeddings(self): return self.embeddings.word_embeddings From 619e0962f8a2c0522a308852708c8055b154caf3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 31 Aug 2023 20:59:26 -0300 Subject: [PATCH 006/252] Adapted Encoder layer --- .../configuration_grounding_dino.py | 8 + .../grounding_dino/modeling_grounding_dino.py | 180 +++++++++++++----- 2 files changed, 137 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e77d4be247b746..3abf4912ebb651 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -199,6 +199,9 @@ def __init__( #other parameters max_text_len = 256, sub_sentence_present = True, + text_enhancer_dropout = 0.0, + fusion_droppath = 0.1, + fusion_dropout = 0.0, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -261,6 +264,11 @@ def __init__( self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) self.max_text_len = max_text_len self.sub_sentence_present = sub_sentence_present + # Text Enhancer + self.text_enhancer_dropout = text_enhancer_dropout + # Fusion + self.fusion_droppath = fusion_droppath + self.fusion_dropout = fusion_dropout super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 731172570c23d2..91129946c6141e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -855,30 +855,28 @@ def 
forward( return attn_output, attn_weights_reshaped # Repeting some code to avoid convert nn.MultiheadAttention later -class GroundingDINOEncoderTextLayer(nn.Module): - def __init__( - self, - embed_dim, - num_heads, - ffn_dim: int, - dropout: float = 0.0, - bias: bool = True, - activation: str = 'relu' - ): +#TODO is this an approriate way to name this? +class GroundingDINOTextEnhancerLayer(nn.Module): + """Vanilla Transformer with text embeddings as input""" + def __init__(self, config): super().__init__() - self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) + self.self_attn = nn.MultiheadAttention( + embed_dim=config.d_model, + num_heads=config.num_heads // 2, + dropout=config.text_enhancer_dropout + ) # Implementation of Feedforward model - self.fc1 = nn.Linear(embed_dim, ffn_dim) - self.dropout = nn.Dropout(dropout) - self.fc2 = nn.Linear(ffn_dim, embed_dim) + self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) + self.dropout = nn.Dropout(config.text_enhancer_dropout) + self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) - self.layer_norm_before = nn.LayerNorm(embed_dim) - self.layer_norm_after = nn.LayerNorm(embed_dim) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) + self.layer_norm_before = nn.LayerNorm(config.d_model) + self.layer_norm_after = nn.LayerNorm(config.d_model) + self.dropout1 = nn.Dropout(config.text_enhancer_dropout) + self.dropout2 = nn.Dropout(config.text_enhancer_dropout) - self.activation = ACT2FN[activation] - self.num_heads = num_heads + self.activation = ACT2FN[config.activation_fuction] + self.num_heads = config.num_heads // 2 def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -903,8 +901,8 @@ def forward( hidden_states = hidden_states + self.dropout2(attention_output) hidden_states = self.layer_norm_after(hidden_states) return hidden_states - -class BiMultiHeadAttention(nn.Module): + +class GroundingDINOBiMultiHeadAttention(nn.Module): def __init__( self, vision_dim: int, @@ -1106,38 +1104,26 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -class GroundingDINOBiAttention(nn.Module): - def __init__( - self, - vision_dim, - text_dim, - embed_dim, - num_heads, - dropout=0.1, - drop_path=0.0, - init_values=1e-4, - ): - """ - Inputs: - embed_dim - Dimensionality of input and attention feature vectors - hidden_dim - Dimensionality of hidden layer in feed-forward network - (usually 2-4x larger than embed_dim) - num_heads - Number of heads to use in the Multi-Head Attention block - dropout - Amount of dropout to apply in the feed-forward network - """ +class GroundingDINOFusionLayer(nn.Module): + def __init__(self, config, init_values=1e-4): super().__init__() + drop_path = config.fusion_droppath # pre layer norm - self.layer_norm_vision = nn.LayerNorm(vision_dim) - self.layer_norm_text = nn.LayerNorm(text_dim) - self.attn = BiMultiHeadAttention( - vision_dim=vision_dim, text_dim=text_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + self.layer_norm_vision = nn.LayerNorm(config.d_model) + self.layer_norm_text = nn.LayerNorm(config.d_model) + self.attn = GroundingDINOBiMultiHeadAttention( + vision_dim=config.d_model, + text_dim=config.d_model, + embed_dim=config.encoder_ffn_dim // 2, + num_heads=config.num_heads // 2, + dropout=config.fusion_dropout ) # add 
layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.gamma_v = nn.Parameter(init_values * torch.ones((vision_dim)), requires_grad=True) - self.gamma_l = nn.Parameter(init_values * torch.ones((text_dim)), requires_grad=True) + self.gamma_v = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) @@ -1153,8 +1139,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at text_features = text_features + self.drop_path(self.gamma_l * delta_l) return vision_features, text_features -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO -class GroundingDINOEncoderLayer(nn.Module): +#NOTE just renamed the class +class GroundingDINODeformableLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() self.embed_dim = config.d_model @@ -1238,6 +1224,98 @@ def forward( return outputs +def get_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, + ) -> Tensor: + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. 
+ """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res + + +class GroundingDINOEncoderLayer(nn.Module): + def __init__(self, config) -> None: + super().__init_() + self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) + self.fusion_layer = GroundingDINOFusionLayer(config) + self.deformable_layer = GroundingDINODeformableLayer(config) + + def forward( + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None + ): + bs, n_text, text_dim = text_features.shape + if text_position_embedding is None and text_position_ids is None: + pos_text = ( + torch.arange(n_text, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + pos_text = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + vision_features, text_features = self.fusion_layer( + vision_features=vision_features, + text_features=text_features, + attention_mask_vision=key_padding_mask, + attention_mask_text=text_attention_mask, + ) + + text_features = self.text_enhancer_layer( + hidden_states=text_features.transpose(0, 1), + attention_masks=~text_self_attention_masks, # note we use ~ for mask here + position_embeddings=(pos_text.transpose(0, 1) if pos_text is not None else None), + ).transpose(0, 1) + + vision_features = self.deformable_layer( + hidden_states=vision_features, + attention_mask=key_padding_mask, + position_embeddings=vision_position_embedding, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + ) + + return vision_features, text_features + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO class GroundingDINODecoderLayer(nn.Module): @@ -1788,7 +1866,6 @@ def custom_forward(*inputs): """, GROUNDING_DINO_START_DOCSTRING, ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO class GroundingDINOModel(GroundingDINOPreTrainedModel): def __init__(self, config: GroundingDINOConfig): super().__init__(config) @@ -1797,6 +1874,8 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + # Create text backbone + self.text_backbone = 
GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -2161,7 +2240,6 @@ def __init__(self, config: GroundingDINOConfig): self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) From 52fa847a77d42bea476070ec51a3cb125cf73928 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 1 Sep 2023 11:37:07 -0300 Subject: [PATCH 007/252] Fixed typos --- .../grounding_dino/modeling_grounding_dino.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 91129946c6141e..984587d3997d67 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -862,7 +862,7 @@ def __init__(self, config): super().__init__() self.self_attn = nn.MultiheadAttention( embed_dim=config.d_model, - num_heads=config.num_heads // 2, + num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout ) # Implementation of Feedforward model @@ -875,8 +875,8 @@ def __init__(self, config): self.dropout1 = nn.Dropout(config.text_enhancer_dropout) self.dropout2 = nn.Dropout(config.text_enhancer_dropout) - self.activation = ACT2FN[config.activation_fuction] - self.num_heads = config.num_heads // 2 + self.activation = ACT2FN[config.activation_function] + self.num_heads = config.encoder_attention_heads // 2 def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -1116,7 +1116,7 @@ def __init__(self, config, init_values=1e-4): vision_dim=config.d_model, text_dim=config.d_model, embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.num_heads // 2, + num_heads=config.encoder_attention_heads // 2, dropout=config.fusion_dropout ) @@ -1258,25 +1258,25 @@ def sine_func(x: torch.Tensor): class GroundingDINOEncoderLayer(nn.Module): def __init__(self, config) -> None: - super().__init_() + super().__init__() self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) def forward( - self, - vision_features: Tensor, - vision_position_embedding: Tensor, - spatial_shapes: Tensor, - level_start_index: Tensor, - key_padding_mask: Tensor, - reference_points: Tensor, - text_features: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None, - text_position_embedding: Optional[Tensor] = None, - text_self_attention_masks: Optional[Tensor] = None, - text_position_ids: Optional[Tensor] = None - ): + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None + ): bs, n_text, text_dim = text_features.shape if text_position_embedding is None and text_position_ids is None: pos_text = 
( From a527a836a55c8110f07cd5ed412546c5cc52e0db Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 4 Sep 2023 13:08:37 -0300 Subject: [PATCH 008/252] Adjusted Encoder --- .../grounding_dino/modeling_grounding_dino.py | 234 +++++++++++++----- 1 file changed, 176 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 984587d3997d67..229c5d89c716f9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -173,6 +173,55 @@ class GroundingDINODecoderOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None cross_attentions: Optional[Tuple[torch.FloatTensor]] = None +@dataclass +class GroundingDINOEncoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDINOEncoder. This class extends + BaseModelOutput, due to: + - vision and text last hidden states + - vision and text intermediate hidden states + - vision and text attentions + - vision and text cross attentions + + Args: + last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the vision encoder. + last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the text encoder. + hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer + plus the initial embedding outputs. + hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer + plus the initial embedding outputs. + attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + """ + last_hidden_state_vision: torch.FloatTensor = None + last_hidden_state_text: torch.FloatTensor = None + hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + attentions_text: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + @dataclass # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO @@ -892,7 +941,7 @@ def forward( attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) - attention_output = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks)[0] + attention_output, attention_weights = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks) hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) @@ -900,7 +949,7 @@ def forward( attention_output = self.fc2(self.dropout(hidden_states)) hidden_states = hidden_states + self.dropout2(attention_output) hidden_states = self.layer_norm_after(hidden_states) - return hidden_states + return hidden_states, attention_weights class GroundingDINOBiMultiHeadAttention(nn.Module): def __init__( @@ -933,10 +982,6 @@ def __init__( self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) - self.stable_softmax_2d = True - self.clamp_min_for_underflow = True - self.clamp_max_for_overflow = True - self._reset_parameters() def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): @@ -1068,7 +1113,7 @@ def forward( vision_attn_output = self.out_vision_proj(vision_attn_output) text_attn_output = self.out_text_proj(text_attn_output) - return vision_attn_output, text_attn_output + return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) # Copied from transformers.models.beit.modeling_beit.drop_path def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: @@ -1128,16 +1173,16 @@ def __init__(self, config, init_values=1e-4): def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) - delta_v, delta_l = self.attn( + (delta_v, vision_attn), (delta_t, text_attn) = self.attn( vision_features, text_features, attention_mask_vision=attention_mask_vision, attention_mask_text=attention_mask_text ) - # vision_features, text_features = vision_features + delta_v, text_features + delta_l vision_features = vision_features + 
self.drop_path(self.gamma_v * delta_v) - text_features = text_features + self.drop_path(self.gamma_l * delta_l) - return vision_features, text_features + text_features = text_features + self.drop_path(self.gamma_l * delta_t) + + return (vision_features, vision_attn), (text_features, text_attn) #NOTE just renamed the class class GroundingDINODeformableLayer(nn.Module): @@ -1263,6 +1308,29 @@ def __init__(self, config) -> None: self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) + def get_text_position_embeddings( + self, + text_features: Tensor, + text_position_embedding: Tensor, + text_position_ids: Tensor + ) -> Tensor: + bs, n_text, text_dim = text_features.shape + if text_position_embedding is None and text_position_ids is None: + text_position_embedding = ( + torch.arange(n_text, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + return text_position_embedding + def forward( self, vision_features: Tensor, @@ -1277,35 +1345,28 @@ def forward( text_self_attention_masks: Optional[Tensor] = None, text_position_ids: Optional[Tensor] = None ): - bs, n_text, text_dim = text_features.shape - if text_position_embedding is None and text_position_ids is None: - pos_text = ( - torch.arange(n_text, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(bs, 1, 1) - ) - pos_text = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) - if text_position_ids is not None: - text_position_embedding = get_sine_pos_embed( - text_position_ids[..., None], num_pos_feats=256, exchange_xy=False - ) + text_position_embedding = self.get_text_position_embeddings( + text_features, + text_position_embedding, + text_position_ids + ) - vision_features, text_features = self.fusion_layer( + (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( vision_features=vision_features, text_features=text_features, attention_mask_vision=key_padding_mask, attention_mask_text=text_attention_mask, ) - text_features = self.text_enhancer_layer( + (text_features, text_enhanced_attn) = self.text_enhancer_layer( hidden_states=text_features.transpose(0, 1), attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=(pos_text.transpose(0, 1) if pos_text is not None else None), + position_embeddings=( + text_position_embedding.transpose(0, 1) if text_position_embedding is not None else None + ), ).transpose(0, 1) - vision_features = self.deformable_layer( + (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, attention_mask=key_padding_mask, position_embeddings=vision_position_embedding, @@ -1314,7 +1375,10 @@ def forward( level_start_index=level_start_index, ) - return vision_features, text_features + return ( + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn) + ) # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO @@ -1538,7 +1602,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from 
transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->GroundingDINO class GroundingDINOEncoder(GroundingDINOPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a @@ -1592,26 +1655,31 @@ def get_reference_points(spatial_shapes, valid_ratios, device): def forward( self, - inputs_embeds=None, - attention_mask=None, - position_embeddings=None, - spatial_shapes=None, - level_start_index=None, + vision_features: Tensor, + vision_attention_mask: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios=None, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: - 1 for pixel features that are real (i.e. **not masked**), - 0 for pixel features that are padding (i.e. **masked**). [What are attention masks?](../glossary#attention-mask) - position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Position embeddings that are added to the queries and keys in each self-attention layer. spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): Spatial shapes of each feature map. @@ -1619,6 +1687,21 @@ def forward( Starting index of each feature map. valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): Ratio of valid area in each feature level. + text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Flattened text features that are passed to the encoder. + text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + text_self_attention_masks (`torch.BoolTensor` of shape `(batch_size, text_seq_len, text_seq_len)`): + Masks to avoid performing attention between padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + text_position_ids (`torch.LongTensor` of shape `(batch_size, num_queries)`): + Position ids for text features. 
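# For reference, a minimal sketch of the sinusoidal embedding applied to the text
# position ids documented above (a simplified stand-in for the `get_sine_pos_embed`
# helper referenced earlier in this file; the temperature and interleaving used
# upstream are assumptions, only the overall shape flow is illustrated).
import math
import torch

def sine_pos_embed(pos: torch.Tensor, num_pos_feats: int = 256, temperature: int = 10000) -> torch.Tensor:
    # pos: (batch_size, seq_len, 1) float positions -> (batch_size, seq_len, num_pos_feats)
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
    pos = pos * 2 * math.pi
    pos = pos[..., None] / dim_t  # (bs, seq_len, 1, num_pos_feats)
    pos = torch.stack((pos[..., 0::2].sin(), pos[..., 1::2].cos()), dim=-1).flatten(-2)
    return pos.squeeze(2)

# usage: default positions 0..seq_len-1 when no explicit text_position_ids are given
batch_size, seq_len = 2, 6
position_ids = torch.arange(seq_len).float().unsqueeze(0).unsqueeze(-1).repeat(batch_size, 1, 1)
text_position_embedding = sine_pos_embed(position_ids)  # (2, 6, 256)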
output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1634,41 +1717,76 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = inputs_embeds - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + #TODO check if this is necessary according to original implementation + vision_features = nn.functional.dropout(vision_features, p=self.dropout, training=self.training) - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None + encoder_vision_states = () if output_hidden_states else None + encoder_text_states = () if output_hidden_states else None + all_attn_fused_text = () if output_attentions else None + all_attn_fused_vision = () if output_attentions else None + all_attn_enhanced_text = () if output_attentions else None + all_attn_deformable = () if output_attentions else None for i, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + # INPUTS FOR ENCODER LAYER + # - vision_features: Tensor, + # - vision_position_embedding: Tensor, + # - spatial_shapes: Tensor, + # - level_start_index: Tensor, + # - key_padding_mask: Tensor, + # - reference_points: Tensor, + # - text_features: Optional[Tensor] = None, + # - text_attention_mask: Optional[Tensor] = None, + # - text_position_embedding: Optional[Tensor] = None, + # - text_self_attention_masks: Optional[Tensor] = None, + # - text_position_ids: Optional[Tensor] = None + (vision_features, text_features), attentions = encoder_layer( + vision_features=vision_features, + vision_position_embedding=vision_position_embedding, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - output_attentions=output_attentions, + key_padding_mask=vision_attention_mask, + reference_points=reference_points, + text_features=text_features, + text_attention_mask=text_attention_mask, + text_position_embedding=text_position_embedding, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=text_position_ids ) - hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) + all_attn_fused_vision += (attentions[0],) + all_attn_fused_text += (attentions[1],) + all_attn_enhanced_text += (attentions[2],) + all_attn_deformable += (attentions[3],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + enc_outputs = [ + vision_features, text_features, + all_attn_fused_vision, all_attn_fused_text, + all_attn_enhanced_text, all_attn_deformable + ] + return tuple(v for v in enc_outputs 
if v is not None) + return GroundingDINOEncoderOutput( + last_hidden_state_vision=vision_features, + last_hidden_state_text=text_features, + hidden_states_vision=encoder_vision_states, + hidden_states_text=encoder_text_states, + cross_attentions_vision=all_attn_fused_vision, + cross_attentions_text=all_attn_fused_text, + attentions_vision=all_attn_deformable, + attentions_text=all_attn_enhanced_text ) - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ From 791943c244879fd054f161932e598f24045fb8eb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 4 Sep 2023 13:09:56 -0300 Subject: [PATCH 009/252] Converted encoder to hf --- .../configuration_grounding_dino.py | 2 +- .../convert_grounding_dino_to_hf.py | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3abf4912ebb651..14e82704cb495b 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -157,7 +157,7 @@ def __init__( backbone_config={"model_type": "swin"}, text_backbone_config="bert-base-uncased", num_channels=3, - num_queries=300, + num_queries=900, max_position_embeddings=1024, encoder_layers=6, encoder_ffn_dim=2048, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d5ebc9281b8733..f9fc7e87d12bba 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -131,6 +131,88 @@ def create_rename_keys(state_dict, config): if "module.bert" in layer_name: rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) ########################################## TEXT BACKBONE - END + + ########################################## ENCODER - START + deformable_key_mappings = { + 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', + 'self_attn.sampling_offsets.bias': 'deformable_layer.self_attn.sampling_offsets.bias', + 'self_attn.attention_weights.weight': 'deformable_layer.self_attn.attention_weights.weight', + 'self_attn.attention_weights.bias': 'deformable_layer.self_attn.attention_weights.bias', + 'self_attn.value_proj.weight': 'deformable_layer.self_attn.value_proj.weight', + 'self_attn.value_proj.bias': 'deformable_layer.self_attn.value_proj.bias', + 'self_attn.output_proj.weight': 'deformable_layer.self_attn.output_proj.weight', + 'self_attn.output_proj.bias': 'deformable_layer.self_attn.output_proj.bias', + 'norm1.weight': 'deformable_layer.self_attn_layer_norm.weight', + 'norm1.bias': 'deformable_layer.self_attn_layer_norm.bias', + 'linear1.weight': 'deformable_layer.fc1.weight', + 'linear1.bias': 'deformable_layer.fc1.bias', + 'linear2.weight': 'deformable_layer.fc2.weight', + 'linear2.bias': 'deformable_layer.fc2.bias', + 'norm2.weight': 'deformable_layer.final_layer_norm.weight', + 'norm2.bias': 'deformable_layer.final_layer_norm.bias', + } + text_enhancer_key_mappings = { + 'self_attn.in_proj_weight': 'text_enhancer_layer.self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 
'text_enhancer_layer.self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'text_enhancer_layer.self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'text_enhancer_layer.self_attn.out_proj.bias', + 'linear1.weight': 'text_enhancer_layer.fc1.weight', + 'linear1.bias': 'text_enhancer_layer.fc1.bias', + 'linear2.weight': 'text_enhancer_layer.fc2.weight', + 'linear2.bias': 'text_enhancer_layer.fc2.bias', + 'norm1.weight': 'text_enhancer_layer.layer_norm_before.weight', + 'norm1.bias': 'text_enhancer_layer.layer_norm_before.bias', + 'norm2.weight': 'text_enhancer_layer.layer_norm_after.weight', + 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', + } + fusion_key_mappings = { + 'gamma_v': 'fusion_layer.gamma_v', + 'gamma_l': 'fusion_layer.gamma_l', + 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', + 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', + 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', + 'layer_norm_l.bias': 'fusion_layer.layer_norm_text.bias', + 'attn.v_proj.weight': 'fusion_layer.attn.vision_proj.weight', + 'attn.v_proj.bias': 'fusion_layer.attn.vision_proj.bias', + 'attn.l_proj.weight': 'fusion_layer.attn.text_proj.weight', + 'attn.l_proj.bias': 'fusion_layer.attn.text_proj.bias', + 'attn.values_v_proj.weight': 'fusion_layer.attn.values_vision_proj.weight', + 'attn.values_v_proj.bias': 'fusion_layer.attn.values_vision_proj.bias', + 'attn.values_l_proj.weight': 'fusion_layer.attn.values_text_proj.weight', + 'attn.values_l_proj.bias': 'fusion_layer.attn.values_text_proj.bias', + 'attn.out_v_proj.weight': 'fusion_layer.attn.out_vision_proj.weight', + 'attn.out_v_proj.bias': 'fusion_layer.attn.out_vision_proj.bias', + 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', + 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', + } + + for layer in range(config.encoder_layers): + # deformable + for src, dest in deformable_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # text enhance + for src, dest in text_enhancer_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.text_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # fusion layers + for src, dest in fusion_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.fusion_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + ########################################## ENCODER - END + + #TODO convert decoder + ########################################## DECODER - START + ########################################## DECODER - END + + #TODO convert head + ########################################## HEAD - START + ########################################## HEAD - END + + #TODO convert additional layers + ########################################## Additional - START + ########################################## Additional - END + # fmt: on return rename_keys @@ -259,6 +341,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) + for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) From 9c57788ee857fc77415b285120e71ba386ac0ba0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 5 Sep 2023 16:10:51 -0300 Subject: [PATCH 010/252] Modified Decoder Layer --- .../grounding_dino/modeling_grounding_dino.py | 51 
++++++++++++++----- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 229c5d89c716f9..9f6edac849f2c9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1388,7 +1388,7 @@ def __init__(self, config: GroundingDINOConfig): self.embed_dim = config.d_model # self-attention - self.self_attn = GroundingDINOMultiheadAttention( + self.self_attn = nn.MultiheadAttention( embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, @@ -1398,6 +1398,13 @@ def __init__(self, config: GroundingDINOConfig): self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention text + self.encoder_attn_text = nn.MultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( config, @@ -1410,6 +1417,9 @@ def __init__(self, config: GroundingDINOConfig): self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) self.final_layer_norm = nn.LayerNorm(self.embed_dim) + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + def forward( self, hidden_states: torch.Tensor, @@ -1417,8 +1427,11 @@ def forward( reference_points=None, spatial_shapes=None, level_start_index=None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, + vision_encoder_hidden_states: Optional[torch.Tensor] = None, + vision_encoder_attention_mask: Optional[torch.Tensor] = None, + text_encoder_hidden_states: Optional[torch.Tensor] = None, + text_encoder_attention_mask: Optional[torch.Tensor] = None, + self_attn_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): """ @@ -1446,9 +1459,10 @@ def forward( # Self Attention hidden_states, self_attn_weights = self.self_attn( - hidden_states=hidden_states, - position_embeddings=position_embeddings, - output_attentions=output_attentions, + query=self.with_pos_embed(hidden_states, position_embeddings), + key=self.with_pos_embed(hidden_states, position_embeddings), + value=hidden_states, + attn_mask=self_attn_mask ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1457,13 +1471,27 @@ def forward( second_residual = hidden_states + # Cross-Attention Text + hidden_states, text_cross_attn_weights = self.encoder_attn_text( + query=self.with_pos_embed(hidden_states, position_embeddings), + key=text_encoder_hidden_states.transpose(0, 1), + value=text_encoder_hidden_states.transpose(0, 1), + attn_mask=text_encoder_attention_mask, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + hidden_states = self.encoder_attn_text_layer_norm(hidden_states) + + third_residual = hidden_states + # Cross-Attention cross_attn_weights = None hidden_states, cross_attn_weights = self.encoder_attn( hidden_states=hidden_states, - attention_mask=encoder_attention_mask, - encoder_hidden_states=encoder_hidden_states, - 
encoder_attention_mask=encoder_attention_mask, + attention_mask=vision_encoder_attention_mask, + encoder_hidden_states=vision_encoder_hidden_states, + encoder_attention_mask=vision_encoder_attention_mask, position_embeddings=position_embeddings, reference_points=reference_points, spatial_shapes=spatial_shapes, @@ -1472,8 +1500,7 @@ def forward( ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = second_residual + hidden_states - + hidden_states = third_residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) # Fully Connected @@ -1488,7 +1515,7 @@ def forward( outputs = (hidden_states,) if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) + outputs += (self_attn_weights, text_cross_attn_weights, cross_attn_weights) return outputs From 962ce238a3fefb2f1f9735bb2843965a666f9f1b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 6 Sep 2023 14:33:57 -0300 Subject: [PATCH 011/252] Modified main decoder class --- .../configuration_grounding_dino.py | 6 +-- .../convert_grounding_dino_to_hf.py | 37 ++++++++++++++ .../grounding_dino/modeling_grounding_dino.py | 49 +++++++++++++------ 3 files changed, 73 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 14e82704cb495b..33de7c666cef19 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -163,7 +163,7 @@ def __init__( encoder_ffn_dim=2048, encoder_attention_heads=8, decoder_layers=6, - decoder_ffn_dim=1024, + decoder_ffn_dim=2048, decoder_attention_heads=8, encoder_layerdrop=0.0, is_encoder_decoder=True, @@ -183,9 +183,9 @@ def __init__( num_feature_levels=4, encoder_n_points=4, decoder_n_points=4, - two_stage=False, + two_stage=True, two_stage_num_proposals=300, - with_box_refine=False, + with_box_refine=True, class_cost=1, bbox_cost=5, giou_cost=2, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index f9fc7e87d12bba..846892980d2d21 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -203,6 +203,43 @@ def create_rename_keys(state_dict, config): #TODO convert decoder ########################################## DECODER - START + key_mappings_decoder = { + 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', + 'cross_attn.sampling_offsets.bias': 'encoder_attn.sampling_offsets.bias', + 'cross_attn.attention_weights.weight': 'encoder_attn.attention_weights.weight', + 'cross_attn.attention_weights.bias': 'encoder_attn.attention_weights.bias', + 'cross_attn.value_proj.weight': 'encoder_attn.value_proj.weight', + 'cross_attn.value_proj.bias': 'encoder_attn.value_proj.bias', + 'cross_attn.output_proj.weight': 'encoder_attn.output_proj.weight', + 'cross_attn.output_proj.bias': 'encoder_attn.output_proj.bias', + 'norm1.weight': 'encoder_attn_layer_norm.weight', + 'norm1.bias': 'encoder_attn_layer_norm.bias', + 'ca_text.in_proj_weight': 'encoder_attn_text.in_proj_weight', + 'ca_text.in_proj_bias': 'encoder_attn_text.in_proj_bias', + 'ca_text.out_proj.weight': 'encoder_attn_text.out_proj.weight', + 'ca_text.out_proj.bias': 'encoder_attn_text.out_proj.bias', + 
'catext_norm.weight': 'encoder_attn_text_layer_norm.weight', + 'catext_norm.bias': 'encoder_attn_text_layer_norm.bias', + 'self_attn.in_proj_weight': 'self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 'self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'self_attn.out_proj.bias', + 'norm2.weight': 'self_attn_layer_norm.weight', + 'norm2.bias': 'self_attn_layer_norm.bias', + 'linear1.weight': 'fc1.weight', + 'linear1.bias': 'fc1.bias', + 'linear2.weight': 'fc2.weight', + 'linear2.bias': 'fc2.bias', + 'norm3.weight': 'final_layer_norm.weight', + 'norm3.bias': 'final_layer_norm.bias', + } + for layer_num in range(config.decoder_layers): + source_prefix_decoder = f'module.transformer.decoder.layers.{layer_num}.' + target_prefix_decoder = f'model.decoder.layers.{layer_num}.' + + for source_name, target_name in key_mappings_decoder.items(): + rename_keys.append((source_prefix_decoder + source_name, + target_prefix_decoder + target_name)) ########################################## DECODER - END #TODO convert head diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9f6edac849f2c9..d57e823199703a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -160,10 +160,14 @@ class GroundingDINODecoderOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + vision_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. + text_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the text cross-attention heads. 
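# Illustrative only: the ordering of attention blocks inside the rewritten decoder
# layer (self-attention, then cross-attention over text, then cross-attention over
# image features), which is why the decoder output now carries both text and vision
# cross-attention tuples. A plain nn.MultiheadAttention stands in for the multiscale
# deformable attention, so class and argument names here are assumptions.
import torch
from torch import nn

class ToyBiModalDecoderLayer(nn.Module):
    def __init__(self, d_model: int = 256, n_heads: int = 8):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.text_cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.image_cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norms = nn.ModuleList([nn.LayerNorm(d_model) for _ in range(3)])

    def forward(self, queries, text_feats, image_feats):
        attn_out, self_weights = self.self_attn(queries, queries, queries)
        queries = self.norms[0](queries + attn_out)
        attn_out, text_weights = self.text_cross_attn(queries, text_feats, text_feats)
        queries = self.norms[1](queries + attn_out)
        attn_out, image_weights = self.image_cross_attn(queries, image_feats, image_feats)
        queries = self.norms[2](queries + attn_out)
        # text_weights / image_weights correspond to the text_cross_attentions and
        # vision_cross_attentions entries collected per layer
        return queries, self_weights, text_weights, image_weights

layer = ToyBiModalDecoderLayer()
out, *_ = layer(torch.randn(1, 900, 256), torch.randn(1, 16, 256), torch.randn(1, 1000, 256))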
""" last_hidden_state: torch.FloatTensor = None @@ -171,7 +175,8 @@ class GroundingDINODecoderOutput(ModelOutput): intermediate_reference_points: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None @dataclass class GroundingDINOEncoderOutput(ModelOutput): @@ -1814,7 +1819,6 @@ def forward( attentions_text=all_attn_enhanced_text ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. @@ -1840,20 +1844,24 @@ def __init__(self, config: GroundingDINOConfig): # hack implementation for iterative bounding box refinement and two-stage Deformable DETR self.bbox_embed = None self.class_embed = None + self.query_scale = None # Initialize weights and apply final processing self.post_init() def forward( self, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, + inputs_embeds, + vision_encoder_hidden_states, + vision_encoder_attention_mask=None, + text_encoder_hidden_states=None, + text_encoder_attention_mask=None, position_embeddings=None, reference_points=None, spatial_shapes=None, level_start_index=None, valid_ratios=None, + self_attn_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -1902,7 +1910,8 @@ def forward( # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None + all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None intermediate = () intermediate_reference_points = () @@ -1930,20 +1939,23 @@ def custom_forward(*inputs): layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, - encoder_hidden_states, - encoder_attention_mask, + vision_encoder_hidden_states, + vision_encoder_attention_mask, None, ) else: layer_outputs = decoder_layer( - hidden_states, + hidden_states=hidden_states, position_embeddings=position_embeddings, - encoder_hidden_states=encoder_hidden_states, reference_points=reference_points_input, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, + vision_encoder_hidden_states=vision_encoder_hidden_states, + vision_encoder_attention_mask=vision_encoder_attention_mask, + text_encoder_hidden_states=text_encoder_hidden_states, + text_encoder_attention_mask=text_encoder_attention_mask, + self_attn_mask=self_attn_mask, + output_attentions=output_attentions ) hidden_states = layer_outputs[0] @@ -1970,8 +1982,12 @@ def custom_forward(*inputs): if output_attentions: all_self_attns += (layer_outputs[1],) - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) + if text_encoder_hidden_states is not None: + all_cross_attns_text += (layer_outputs[2],) + + if 
vision_encoder_hidden_states is not None: + all_cross_attns_vision += (layer_outputs[3],) + # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) @@ -2000,7 +2016,8 @@ def custom_forward(*inputs): intermediate_reference_points=intermediate_reference_points, hidden_states=all_hidden_states, attentions=all_self_attns, - cross_attentions=all_cross_attentions, + vision_cross_attentions=all_cross_attns_vision, + text_cross_attentions=all_cross_attns_text ) From 9aedd7f0445db756e18217cc2ad7f6e67140f4c1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 6 Sep 2023 14:38:56 -0300 Subject: [PATCH 012/252] Removed copy comments --- .../models/grounding_dino/modeling_grounding_dino.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d57e823199703a..8cd584c1fcc71c 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -137,7 +137,6 @@ def backward(context, grad_output): @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->GroundingDINO class GroundingDINODecoderOutput(ModelOutput): """ Base class for outputs of the GroundingDINODecoder. This class adds two attributes to @@ -1153,7 +1152,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) - class GroundingDINOFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() @@ -1386,7 +1384,6 @@ def forward( ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO class GroundingDINODecoderLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() @@ -2006,7 +2003,8 @@ def custom_forward(*inputs): intermediate_reference_points, all_hidden_states, all_self_attns, - all_cross_attentions, + all_cross_attns_vision, + all_cross_attns_text ] if v is not None ) From 65fb442af03ca42ea9254757125648cb2fa9a446 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 10 Sep 2023 23:21:17 -0300 Subject: [PATCH 013/252] Fixed forward from GroundingDINOModel and GroundingDINODecoder --- .../configuration_grounding_dino.py | 14 ++ .../convert_grounding_dino_to_hf.py | 9 + .../grounding_dino/modeling_grounding_dino.py | 190 +++++++++++++----- 3 files changed, 162 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 33de7c666cef19..bc43655df050ee 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -130,6 +130,18 @@ class GroundingDINOConfig(PretrainedConfig): disable_custom_kernels (`bool`, *optional*, defaults to `False`): Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom kernels are not supported by PyTorch ONNX export. + max_text_len (`int`, *optional*, defaults to 256): + The maximum length of the text input. + sub_sentence_present (`bool`, *optional*, defaults to `True`): + Whether to use sub-sentence present in the text input. 
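# A rough sketch of the "sub-sentence" idea behind `sub_sentence_present`: tokens
# from different phrases in the prompt only attend to tokens of their own phrase,
# giving a block-diagonal text self-attention mask split at separator tokens.
# Function name and the default separator ids (bert-base-uncased [CLS]/[SEP]/".")
# are assumptions for illustration, not the exact helper used by the model.
import torch

def sub_sentence_attention_mask(input_ids: torch.Tensor, separator_ids=(101, 102, 1012)) -> torch.Tensor:
    # input_ids: (batch_size, seq_len) -> bool mask of shape (batch_size, seq_len, seq_len)
    batch_size, seq_len = input_ids.shape
    special = torch.zeros_like(input_ids, dtype=torch.bool)
    for sep in separator_ids:
        special |= input_ids == sep
    mask = torch.eye(seq_len, dtype=torch.bool, device=input_ids.device).unsqueeze(0).repeat(batch_size, 1, 1)
    for b in range(batch_size):
        start = 0
        for pos in range(seq_len):
            if special[b, pos]:
                # allow attention within the current phrase block
                mask[b, start:pos + 1, start:pos + 1] = True
                start = pos + 1
    return mask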
+ text_enhancer_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the text enhancer. + fusion_droppath (`float`, *optional*, defaults to 0.1): + The droppath ratio for the fusion module. + fusion_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the fusion module. + embedding_init_target (`bool`, *optional*, defaults to `True`): + Whether to initialize the target with Embedding weights. Examples: @@ -202,6 +214,7 @@ def __init__( text_enhancer_dropout = 0.0, fusion_droppath = 0.1, fusion_dropout = 0.0, + embedding_init_target = True, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -269,6 +282,7 @@ def __init__( # Fusion self.fusion_droppath = fusion_droppath self.fusion_dropout = fusion_dropout + self.embedding_init_target = embedding_init_target super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 846892980d2d21..efced9cba0d522 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -248,6 +248,15 @@ def create_rename_keys(state_dict, config): #TODO convert additional layers ########################################## Additional - START + for layer_name, params in state_dict.items(): + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE + if "module.input_proj" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.input_proj", "model.input_proj_vision"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE + if "module.feat_map" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.feat_map", "model.input_proj_text"))) + #### + ########################################## Additional - END # fmt: on diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8cd584c1fcc71c..35ed14fa6859bc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1521,6 +1521,27 @@ def forward( return outputs +class GroundingDINOContrastiveEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.max_text_len = config.max_text_len + + def forward( + self, + vision_hidden_state: torch.FloatTensor, + text_hiddend_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor + ) -> torch.FloatTensor: + + + output = vision_hidden_state @ text_hiddend_state.transpose(-1, -2) + output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_output = torch.full((*output.shape[:-1], self.max_text_len), float("-inf"), device=output.device) + new_output[..., : output.shape[-1]] = output + + return new_output # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead class GroundingDINOClassificationHead(nn.Module): @@ -1836,6 +1857,12 @@ def __init__(self, config: GroundingDINOConfig): self.dropout = config.dropout self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) + self.reference_points_head = GroundingDINOMLPPredictionHead( + config.query_dim // 2 * config.d_model, + config.d_model, + config.d_model, + 2 + ) self.gradient_checkpointing = False # hack implementation for iterative bounding 
box refinement and two-stage Deformable DETR @@ -1846,6 +1873,45 @@ def __init__(self, config: GroundingDINOConfig): # Initialize weights and apply final processing self.post_init() + def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTensor: + """Get the position embedding of the proposals.""" + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries + pos_x = proposals[:, :, 0] * scale + pos_y = proposals[:, :, 1] * scale + # batch_size, num_queries, num_pos_feats + pos_x = pos_x[:, :, None] / dim_t + pos_y = pos_y[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + + if proposals.size(-1) == 2: + # batch_size, num_queries, num_pos_feats * 2 + pos = torch.cat((pos_y, pos_x), dim=2) + elif proposals.size(-1) == 4: + w_embed = proposals[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = proposals[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + # batch_size, num_queries, num_pos_feats * 4 + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError("Unknown proposals shape(-1):{}".format(proposals.size(-1))) + return pos + + + def forward( self, inputs_embeds, @@ -1853,7 +1919,6 @@ def forward( vision_encoder_attention_mask=None, text_encoder_hidden_states=None, text_encoder_attention_mask=None, - position_embeddings=None, reference_points=None, spatial_shapes=None, level_start_index=None, @@ -1875,8 +1940,6 @@ def forward( in `[0, 1]`: - 1 for pixels that are real (i.e. **not masked**), - 0 for pixels that are padding (i.e. **masked**). - position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): - Position embeddings that are added to the queries and keys in each self-attention layer. reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. 
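# Rough sketch of how the 4-d reference boxes documented above are turned into
# per-query position embeddings: each (cx, cy, w, h) coordinate gets sine/cosine
# features, and the concatenation goes through a small MLP (the role of
# `reference_points_head`). Standalone toy version; coordinate ordering and exact
# dimensions are assumptions.
import math
import torch
from torch import nn

def box_sine_embed(boxes: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000) -> torch.Tensor:
    # boxes: (bs, num_queries, 4) in [0, 1] -> (bs, num_queries, 4 * num_pos_feats)
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=boxes.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
    embeds = []
    for i in range(boxes.shape[-1]):
        coord = boxes[:, :, i] * 2 * math.pi
        coord = coord[:, :, None] / dim_t
        embeds.append(torch.stack((coord[:, :, 0::2].sin(), coord[:, :, 1::2].cos()), dim=3).flatten(2))
    return torch.cat(embeds, dim=2)

d_model = 256
reference_points_head = nn.Sequential(nn.Linear(2 * d_model, d_model), nn.ReLU(), nn.Linear(d_model, d_model))
reference_boxes = torch.rand(2, 900, 4)  # e.g. sigmoid-ed top-k proposals
query_pos = reference_points_head(box_sine_embed(reference_boxes))  # (2, 900, 256)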
spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): @@ -1921,6 +1984,8 @@ def forward( if reference_points.shape[-1] != 2: raise ValueError("Reference points' last dimension must be of size 2") reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + query_pos = self.get_proposal_pos_embed(reference_points_input[:, :, 0, :]) + query_pos = self.reference_points_head(query_pos) if output_hidden_states: all_hidden_states += (hidden_states,) @@ -1943,7 +2008,7 @@ def custom_forward(*inputs): else: layer_outputs = decoder_layer( hidden_states=hidden_states, - position_embeddings=position_embeddings, + position_embeddings=query_pos, reference_points=reference_points_input, spatial_shapes=spatial_shapes, level_start_index=level_start_index, @@ -2034,8 +2099,6 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) - # Create text backbone - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -2057,9 +2120,9 @@ def __init__(self, config: GroundingDINOConfig): ) ) in_channels = config.d_model - self.input_proj = nn.ModuleList(input_proj_list) + self.input_proj_vision = nn.ModuleList(input_proj_list) else: - self.input_proj = nn.ModuleList( + self.input_proj_vision = nn.ModuleList( [ nn.Sequential( nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), @@ -2068,8 +2131,12 @@ def __init__(self, config: GroundingDINOConfig): ] ) - if not config.two_stage: - self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + # Create text backbone + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) + self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) + + if config.embedding_init_target or not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) self.encoder = GroundingDINOEncoder(config) self.decoder = GroundingDINODecoder(config) @@ -2079,10 +2146,8 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) - self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) - self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) else: - self.reference_points = nn.Linear(config.d_model, 2) + self.reference_points = nn.Embedding(config.num_queries, 4) self.post_init() @@ -2164,6 +2229,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) proposals.append(proposal) _cur += height * width + output_proposals = torch.cat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid @@ -2181,12 +2247,15 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values, - pixel_mask=None, - decoder_attention_mask=None, + pixel_values: Tensor, + input_ids: Tensor, + attention_mask: Tensor, + token_type_ids: Tensor, + text_token_mask: Tensor, + 
text_self_attention_masks: Tensor, + position_ids: Tensor, + pixel_mask: Optional[Tensor]=None, encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -2221,6 +2290,10 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Extract text features from text backbone + text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)["last_hidden_state"] + text_features = self.input_proj_text(text_features) + batch_size, num_channels, height, width = pixel_values.shape device = pixel_values.device @@ -2230,13 +2303,13 @@ def forward( # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) # First, sent pixel_values + pixel_mask through Backbone to obtain the features # which is a list of tuples - features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) sources = [] masks = [] - for level, (source, mask) in enumerate(features): - sources.append(self.input_proj[level](source)) + for level, (source, mask) in enumerate(vision_features): + sources.append(self.input_proj_vision[level](source)) masks.append(mask) if mask is None: raise ValueError("No attention mask was provided") @@ -2246,9 +2319,9 @@ def forward( _len_sources = len(sources) for level in range(_len_sources, self.config.num_feature_levels): if level == _len_sources: - source = self.input_proj[level](features[-1][0]) + source = self.input_proj_vision[level](vision_features[-1][0]) else: - source = self.input_proj[level](sources[-1]) + source = self.input_proj_vision[level](sources[-1]) mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) sources.append(source) @@ -2257,7 +2330,7 @@ def forward( # Create queries query_embeds = None - if not self.config.two_stage: + if self.config.embedding_init_target or self.config.two_stage: query_embeds = self.query_position_embeddings.weight # Prepare encoder inputs (by flattening) @@ -2288,26 +2361,35 @@ def forward( # Also provide spatial_shapes, level_start_index and valid_ratios if encoder_outputs is None: encoder_outputs = self.encoder( - inputs_embeds=source_flatten, - attention_mask=mask_flatten, - position_embeddings=lvl_pos_embed_flatten, + vision_features=source_flatten, + vision_attention_mask=mask_flatten, + vision_position_embedding=lvl_pos_embed_flatten, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, + text_features=text_features, + text_attention_mask=text_token_mask, + text_position_embedding=None, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=return_dict ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + # If the 
user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): + encoder_outputs = GroundingDINOEncoderOutput( + last_hidden_state_vision=encoder_outputs[0], + last_hidden_state_text=encoder_outputs[1], + hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, + attentions_vision=encoder_outputs[4] if len(encoder_outputs) > 4 else None, + attentions_text=encoder_outputs[5] if len(encoder_outputs) > 5 else None, + cross_attentions_vision=encoder_outputs[6] if len(encoder_outputs) > 6 else None, + cross_attentions_text=encoder_outputs[7] if len(encoder_outputs) > 7 else None, ) # Fifth, prepare decoder inputs - batch_size, _, num_channels = encoder_outputs[0].shape enc_outputs_class = None enc_outputs_coord_logits = None if self.config.two_stage: @@ -2318,14 +2400,19 @@ def forward( # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) - enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + enc_outputs_class = self.decoder.class_embed[-1]( + object_query_embedding, + encoder_outputs[1], + text_token_mask + ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals # only keep top scoring `config.two_stage_num_proposals` proposals topk = self.config.two_stage_num_proposals - topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_logits = enc_outputs_class.max(-1)[0] + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] topk_coords_logits = torch.gather( enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) ) @@ -2333,27 +2420,31 @@ def forward( topk_coords_logits = topk_coords_logits.detach() reference_points = topk_coords_logits.sigmoid() init_reference_points = reference_points - pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) - query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + if query_embeds: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + else: + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ).detach() else: - query_embed, target = torch.split(query_embeds, num_channels, dim=1) - query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) - target = target.unsqueeze(0).expand(batch_size, -1, -1) - reference_points = self.reference_points(query_embed).sigmoid() + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() init_reference_points = reference_points decoder_outputs = self.decoder( inputs_embeds=target, - position_embeddings=query_embed, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=mask_flatten, + vision_encoder_hidden_states=encoder_outputs[0], + vision_encoder_attention_mask=mask_flatten, + text_encoder_hidden_states=encoder_outputs[1], + text_encoder_attention_mask=text_token_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, + 
self_attn_mask=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=return_dict ) if not return_dict: @@ -2396,14 +2487,11 @@ def __init__(self, config: GroundingDINOConfig): self.model = GroundingDINOModel(config) # Detection heads on top - self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.class_embed = GroundingDINOContrastiveEmbedding(config) self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - prior_prob = 0.01 - bias_value = -math.log((1 - prior_prob) / prior_prob) - self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) From fd6ba8768c694b815df31b2293bff7516847763d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 11 Sep 2023 23:40:10 -0300 Subject: [PATCH 014/252] Added all necessary layers, configurations and forward logic up to GroundingDINOModel --- .../configuration_grounding_dino.py | 19 +++++++ .../grounding_dino/modeling_grounding_dino.py | 52 +++++++++++-------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index bc43655df050ee..e413d43b55cd89 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -142,6 +142,14 @@ class GroundingDINOConfig(PretrainedConfig): The dropout ratio for the fusion module. embedding_init_target (`bool`, *optional*, defaults to `True`): Whether to initialize the target with Embedding weights. + query_dim (`int`, *optional*, defaults to 4): + The dimension of the query vector. + decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): + Whether to share the bbox embedding between the decoder and the two-stage bbox generator. + two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. + two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. 
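# Minimal illustration of what the *_share flags above amount to: the shared path
# reuses one prediction-head instance across decoder layers (tied parameters),
# while the non-shared path deep-copies it so every layer gets independent weights.
# The toy head below is an assumption standing in for the bbox MLP.
import copy
from torch import nn

bbox_head = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 4))
decoder_layers = 6

shared = nn.ModuleList([bbox_head for _ in range(decoder_layers)])               # tied weights
independent = nn.ModuleList([copy.deepcopy(bbox_head) for _ in range(decoder_layers)])

assert shared[0] is shared[-1]
assert independent[0] is not independent[-1]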
Examples: @@ -215,6 +223,10 @@ def __init__( fusion_droppath = 0.1, fusion_dropout = 0.0, embedding_init_target = True, + query_dim = 4, + decoder_bbox_embed_share = True, + two_stage_bbox_embed_share = False, + two_stage_class_embed_share = False, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -282,7 +294,14 @@ def __init__( # Fusion self.fusion_droppath = fusion_droppath self.fusion_dropout = fusion_dropout + # Others self.embedding_init_target = embedding_init_target + self.query_dim = query_dim + self.decoder_bbox_embed_share = decoder_bbox_embed_share + self.two_stage_bbox_embed_share = two_stage_bbox_embed_share + if two_stage_bbox_embed_share and not decoder_bbox_embed_share: + raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") + self.two_stage_class_embed_share = two_stage_class_embed_share super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 35ed14fa6859bc..4c35a8cf4b7814 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1856,6 +1856,7 @@ def __init__(self, config: GroundingDINOConfig): super().__init__(config) self.dropout = config.dropout + self.layer_norm = nn.LayerNorm(config.d_model) self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.reference_points_head = GroundingDINOMLPPredictionHead( config.query_dim // 2 * config.d_model, @@ -2038,7 +2039,7 @@ def custom_forward(*inputs): new_reference_points = new_reference_points.sigmoid() reference_points = new_reference_points.detach() - intermediate += (hidden_states,) + intermediate += (self.layer_norm(hidden_states),) intermediate_reference_points += (reference_points,) if output_attentions: @@ -2146,6 +2147,8 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) + self.encoder_output_bbox_embed = None + self.encoder_output_class_embed = None else: self.reference_points = nn.Embedding(config.num_queries, 4) @@ -2400,13 +2403,13 @@ def forward( # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. 
foreground and background) - enc_outputs_class = self.decoder.class_embed[-1]( + enc_outputs_class = self.encoder_output_class_embed( object_query_embedding, encoder_outputs[1], text_token_mask ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) - delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals # only keep top scoring `config.two_stage_num_proposals` proposals @@ -2487,32 +2490,35 @@ def __init__(self, config: GroundingDINOConfig): self.model = GroundingDINOModel(config) # Detection heads on top - self.class_embed = GroundingDINOContrastiveEmbedding(config) - self.bbox_embed = GroundingDINOMLPPredictionHead( + _class_embed = GroundingDINOContrastiveEmbedding(config) + _bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) - nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - # if two-stage, the last class_embed and bbox_embed is for region proposal generation - num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers - if config.with_box_refine: - self.class_embed = _get_clones(self.class_embed, num_pred) - self.bbox_embed = _get_clones(self.bbox_embed, num_pred) - nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) - # hack implementation for iterative bounding box refinement - self.model.decoder.bbox_embed = self.bbox_embed + + if config.decoder_bbox_embed_share: + self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: - nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) - self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) - self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) - self.model.decoder.bbox_embed = None + self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) + self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) + # hack implementation for two-stage + self.model.decoder.bbox_embed = self.bbox_embed + self.model.decoder.class_embed = self.class_embed + if config.two_stage: - # hack implementation for two-stage - self.model.decoder.class_embed = self.class_embed - for box_embed in self.bbox_embed: - nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + if config.two_stage_bbox_embed_share: + self.model.encoder_output_bbox_embed = _bbox_embed + else: + self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) + + #TODO don't believe this is necessary since class_embed has no parameters + if config.two_stage_class_embed_share: + self.model.encoder_output_class_embed = _class_embed + else: + self.model.encoder_output_class_embed = copy.deepcopy(_class_embed) # Initialize weights and apply final processing self.post_init() From dca093b25bffa2a13ccd2cc7d292601ef83a51a3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 12 Sep 2023 00:16:28 -0300 Subject: [PATCH 015/252] Added all layers to convertion --- .../convert_grounding_dino_to_hf.py | 101 ++++++++++-------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py 
b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index efced9cba0d522..4c74404b19b288 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -66,72 +66,66 @@ def create_rename_keys(state_dict, config): #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer - rename_keys.append(("module.backbone.0.patch_embed.proj.weight", + rename_keys.append(("backbone.0.patch_embed.proj.weight", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("module.backbone.0.patch_embed.proj.bias", + rename_keys.append(("backbone.0.patch_embed.proj.bias", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("module.backbone.0.patch_embed.norm.weight", + rename_keys.append(("backbone.0.patch_embed.norm.weight", "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("module.backbone.0.patch_embed.norm.bias", + rename_keys.append(("backbone.0.patch_embed.norm.bias", "model.backbone.conv_encoder.model.embeddings.norm.bias")) for layer, depth in enumerate(config.backbone_config.depths): for block in range(depth): # layernorms - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) # intermidiate - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) # output - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) # downsample if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.reduction.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.bias", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) for out_indice in config.backbone_config.out_indices: # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"module.backbone.0.norm{out_indice-1}.weight", + rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"module.backbone.0.norm{out_indice-1}.bias", + rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) ########################################## VISION BACKBONE - END - ########################################## TEXT BACKBONE - START - for layer_name, params in state_dict.items(): - if "module.bert" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) - ########################################## TEXT BACKBONE - END - ########################################## ENCODER - START deformable_key_mappings = { 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', @@ -185,23 +179,21 @@ def create_rename_keys(state_dict, config): 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', } - for layer in range(config.encoder_layers): # deformable for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.layers.{layer}.{src}", + 
rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # text enhance for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.text_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # fusion layers for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.fusion_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) ########################################## ENCODER - END - #TODO convert decoder ########################################## DECODER - START key_mappings_decoder = { 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', @@ -234,7 +226,7 @@ def create_rename_keys(state_dict, config): 'norm3.bias': 'final_layer_norm.bias', } for layer_num in range(config.decoder_layers): - source_prefix_decoder = f'module.transformer.decoder.layers.{layer_num}.' + source_prefix_decoder = f'transformer.decoder.layers.{layer_num}.' target_prefix_decoder = f'model.decoder.layers.{layer_num}.' for source_name, target_name in key_mappings_decoder.items(): @@ -246,17 +238,36 @@ def create_rename_keys(state_dict, config): ########################################## HEAD - START ########################################## HEAD - END - #TODO convert additional layers ########################################## Additional - START for layer_name, params in state_dict.items(): + #### TEXT BACKBONE + if "bert" in layer_name: + rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE - if "module.input_proj" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.input_proj", "model.input_proj_vision"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE - if "module.feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.feat_map", "model.input_proj_text"))) - #### - + if "input_proj" in layer_name: + rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE + if "feat_map" in layer_name: + rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) + #### DECODER REFERENCE POINT HEAD + if "transformer.decoder.ref_point_head" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", + "model.decoder.reference_points_head"))) + #### DECODER BBOX EMBED + if "transformer.decoder.bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", + "model.decoder.bbox_embed"))) + if "transformer.enc_output" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) + + if "transformer.enc_out_bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", + "model.encoder_output_bbox_embed"))) + + rename_keys.append(("transformer.level_embed", "model.level_embed")) + rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) + rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) + rename_keys.append(("transformer.tgt_embed.weight", "model.query_position_embeddings.weight")) 
########################################## Additional - END # fmt: on @@ -274,8 +285,8 @@ def read_in_q_k_v(state_dict, config): hidden_size = embed_dim * 2**layer for block in range(depth): # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") + in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] @@ -382,7 +393,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): config = get_grounding_dino_config(model_name) # Load original checkpoint - original_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + original_state_dict = torch.load(checkpoint_path, map_location="cpu") # Rename keys new_state_dict = original_state_dict.copy() @@ -452,7 +463,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) parser.add_argument( "--checkpoint_path", - default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny.pth", + default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", type=str, help="Path to the original PyTorch checkpoint (.pth file).", ) From cba79882fc3a78dffd432511171966f920937dc9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 12 Sep 2023 11:24:24 -0300 Subject: [PATCH 016/252] Fixed outputs for GroundingDINOModel and GroundingDINOForObjectDetection --- .../grounding_dino/modeling_grounding_dino.py | 156 +++++++++++++----- 1 file changed, 113 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 4c35a8cf4b7814..c3d094285dcf0d 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -228,10 +228,9 @@ class GroundingDINOEncoderOutput(ModelOutput): @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOModelOutput(ModelOutput): """ - Base class for outputs of the Deformable DETR encoder-decoder model. + Base class for outputs of the Grounding DINO encoder-decoder model. Args: init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): @@ -250,25 +249,47 @@ class GroundingDINOModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. 
- cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the - self-attention heads. - enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each + layer plus the initial embedding outputs. 
+ encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). - enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. 
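Code that consumed the old single-stream fields (`encoder_last_hidden_state`, `cross_attentions`) now has to pick a modality explicitly. A minimal sketch of the new access pattern, using a dummy output object in place of a real forward pass (all sizes below are illustrative assumptions):

```python
import torch
from transformers.models.grounding_dino.modeling_grounding_dino import GroundingDINOModelOutput

batch, num_queries, seq_vision, seq_text, d_model = 1, 900, 13294, 6, 256  # illustrative only

# stand-in for the value returned by GroundingDINOModel.forward(..., return_dict=True)
outputs = GroundingDINOModelOutput(
    last_hidden_state=torch.randn(batch, num_queries, d_model),
    encoder_last_hidden_state_vision=torch.randn(batch, seq_vision, d_model),
    encoder_last_hidden_state_text=torch.randn(batch, seq_text, d_model),
)

vision_memory = outputs.encoder_last_hidden_state_vision  # flattened multi-scale image features
text_memory = outputs.encoder_last_hidden_state_text      # BERT-encoded text features
```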
""" @@ -278,16 +299,21 @@ class GroundingDINOModelOutput(ModelOutput): intermediate_reference_points: torch.FloatTensor = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->GroundingDINO class GroundingDINOObjectDetectionOutput(ModelOutput): """ Output type of [`GroundingDINOForObjectDetection`]. @@ -320,20 +346,42 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. 
- encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, - 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average - in the self-attention heads. + encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. 
+ encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -359,12 +407,18 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): intermediate_reference_points: Optional[torch.FloatTensor] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - enc_outputs_class: Optional = None - enc_outputs_coord_logits: Optional = None + decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None def _get_clones(module, N): @@ -1988,8 +2042,11 @@ def forward( query_pos = self.get_proposal_pos_embed(reference_points_input[:, :, 0, :]) query_pos = self.reference_points_head(query_pos) + # In original implementation they apply layer norm before outputting intermediate hidden states + # Though that's not through between layers so the layers use as input the output of the previous layer + # withtout layer norm if output_hidden_states: - all_hidden_states += (hidden_states,) + all_hidden_states += (self.layer_norm(hidden_states),) if self.gradient_checkpointing and self.training: @@ -2055,6 +2112,7 @@ def custom_forward(*inputs): # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + hidden_states = self.layer_norm(hidden_states) # add hidden states from the last decoder layer if output_hidden_states: @@ -2463,10 +2521,16 @@ def forward( intermediate_reference_points=decoder_outputs.intermediate_reference_points, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - 
encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, + decoder_cross_attentions_vision=decoder_outputs.vision_cross_attentions, + decoder_cross_attentions_text=decoder_outputs.text_cross_attentions, + encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, + encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, + encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, + encoder_hidden_states_text=encoder_outputs.hidden_states_text, + encoder_attentions_vision=encoder_outputs.attentions_vision, + encoder_attentions_text=encoder_outputs.attentions_text, + encoder_cross_attentions_vision=encoder_outputs.cross_attentions_vision, + encoder_cross_attentions_text=encoder_outputs.cross_attentions_text, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, ) @@ -2588,7 +2652,7 @@ def forward( ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # First, sent images through DETR base model to obtain encoder + decoder outputs + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( pixel_values, pixel_mask=pixel_mask, @@ -2688,10 +2752,16 @@ def forward( last_hidden_state=outputs.last_hidden_state, decoder_hidden_states=outputs.decoder_hidden_states, decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, + decoder_cross_attentions_vision=outputs.decoder_cross_attentions_vision, + decoder_cross_attentions_text=outputs.decoder_cross_attentions_text, + encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, + encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, + encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, + encoder_hidden_states_text=outputs.encoder_hidden_states_text, + encoder_attentions_vision=outputs.encoder_attentions_vision, + encoder_attentions_text=outputs.encoder_attentions_text, + encoder_cross_attentions_text=outputs.encoder_cross_attentions_text, + encoder_cross_attentions_vision=outputs.encoder_cross_attentions_vision, intermediate_hidden_states=outputs.intermediate_hidden_states, intermediate_reference_points=outputs.intermediate_reference_points, init_reference_points=outputs.init_reference_points, From d47864e3ed02f0a25f22eb505bb09f18f7fe6db0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 11:58:02 -0300 Subject: [PATCH 017/252] Fixed mask input to encoders and fixed nn.MultiheadAttention batch first and attn output --- .../convert_grounding_dino_to_hf.py | 30 ++++----- .../grounding_dino/modeling_grounding_dino.py | 61 ++++++++++++------- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4c74404b19b288..15793a0df03ae7 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -385,7 +385,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token tokenized_for_encoder["attention_mask"] = text_self_attention_masks tokenized_for_encoder["position_ids"] = position_ids - return 
tokenized_for_encoder + return tokenized_for_encoder, tokenized.attention_mask.bool() @torch.no_grad() def convert_grounding_dino_checkpoint(model_name, checkpoint_path): @@ -418,25 +418,17 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ] ) image_inputs = image_processor(image) - text_inputs = text_processor(text, config) - - pixel_mask = torch.ones( - ((1, image_inputs.shape[1], image_inputs.shape[2])), - dtype=torch.long, - device=image_inputs.device + text_inputs, text_token_mask = text_processor(text, config) + + outputs = model( + pixel_values=image_inputs.unsqueeze(0), + input_ids=text_inputs["input_ids"], + attention_mask=text_inputs["attention_mask"], + token_type_ids=text_inputs["token_type_ids"], + text_token_mask=text_token_mask, + text_self_attention_masks=text_inputs["attention_mask"], + position_ids=text_inputs["position_ids"], ) - # output = model.model.backbone.conv_encoder.model(pixel_values=image_inputs.unsqueeze(0)) - output = model.model.text_backbone(**text_inputs) - print(output.last_hidden_state[:, :, :5]) - - # for feature_map in output.last_hidden_state: - # print(f"{feature_map.shape}") - # print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") - - # outputs = model(**inputs).logits - - # print(outputs.keys()) - # print("Looks ok!") # if pytorch_dump_folder_path is not None: # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index c3d094285dcf0d..2cc715b10cce4f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -970,7 +970,8 @@ def __init__(self, config): self.self_attn = nn.MultiheadAttention( embed_dim=config.d_model, num_heads=config.encoder_attention_heads // 2, - dropout=config.text_enhancer_dropout + dropout=config.text_enhancer_dropout, + batch_first=True, ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) @@ -999,7 +1000,13 @@ def forward( attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) - attention_output, attention_weights = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks) + attention_output, attention_weights = self.self_attn( + query=q, + key=k, + value=hidden_states, + attn_mask=attention_masks, + average_attn_weights=False + ) hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) @@ -1233,8 +1240,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at (delta_v, vision_attn), (delta_t, text_attn) = self.attn( vision_features, text_features, - attention_mask_vision=attention_mask_vision, - attention_mask_text=attention_mask_text + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text ) vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) text_features = text_features + self.drop_path(self.gamma_l * delta_t) @@ -1448,6 +1455,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, + batch_first=True ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -1459,6 +1467,7 @@ def __init__(self, config: GroundingDINOConfig): 
embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, + batch_first=True ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -1518,7 +1527,8 @@ def forward( query=self.with_pos_embed(hidden_states, position_embeddings), key=self.with_pos_embed(hidden_states, position_embeddings), value=hidden_states, - attn_mask=self_attn_mask + attn_mask=self_attn_mask, + average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1533,6 +1543,7 @@ def forward( key=text_encoder_hidden_states.transpose(0, 1), value=text_encoder_hidden_states.transpose(0, 1), attn_mask=text_encoder_attention_mask, + average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -2423,13 +2434,13 @@ def forward( if encoder_outputs is None: encoder_outputs = self.encoder( vision_features=source_flatten, - vision_attention_mask=mask_flatten, + vision_attention_mask=~mask_flatten, vision_position_embedding=lvl_pos_embed_flatten, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, text_features=text_features, - text_attention_mask=text_token_mask, + text_attention_mask=~text_token_mask, text_position_embedding=None, text_self_attention_masks=text_self_attention_masks, text_position_ids=position_ids, @@ -2599,16 +2610,19 @@ def _set_aux_loss(self, outputs_class, outputs_coord): @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values, - pixel_mask=None, - decoder_attention_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + attention_mask: torch.BoolTensor, + token_type_ids: torch.LongTensor, + text_token_mask: torch.BoolTensor, + text_self_attention_masks: torch.BoolTensor, + position_ids: torch.LongTensor, + pixel_mask: Optional[torch.BoolTensor]=None, + encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]]=None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + return_dict: Optional[bool]=None, ): r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2654,12 +2668,15 @@ def forward( # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( - pixel_values, - pixel_mask=pixel_mask, - decoder_attention_mask=decoder_attention_mask, + pixel_values=pixel_values , + input_ids=input_ids , + attention_mask=attention_mask , + token_type_ids=token_type_ids , + text_token_mask=text_token_mask , + text_self_attention_masks=text_self_attention_masks , + position_ids=position_ids , + pixel_mask=pixel_mask , encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, From b9f9553010ccef357ea9e0cbb509aed59f5957f4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 14:14:57 -0300 Subject: [PATCH 018/252] Fixed forward from GroundingDINOTextEnhancerLayer --- .../grounding_dino/modeling_grounding_dino.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 
deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2cc715b10cce4f..36822d53eaa9ab 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -975,16 +975,14 @@ def __init__(self, config): ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) - self.dropout = nn.Dropout(config.text_enhancer_dropout) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) self.layer_norm_before = nn.LayerNorm(config.d_model) self.layer_norm_after = nn.LayerNorm(config.d_model) - self.dropout1 = nn.Dropout(config.text_enhancer_dropout) - self.dropout2 = nn.Dropout(config.text_enhancer_dropout) self.activation = ACT2FN[config.activation_function] self.num_heads = config.encoder_attention_heads // 2 + self.dropout = config.text_enhancer_dropout def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -995,7 +993,7 @@ def forward( attention_masks: Optional[Tensor] = None, position_embeddings: Optional[Tensor] = None, ): # repeat attn mask - if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[1]: + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) @@ -1007,13 +1005,18 @@ def forward( attn_mask=attention_masks, average_attn_weights=False ) + attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) + hidden_states = hidden_states + attention_output + residual = hidden_states - hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) hidden_states = self.activation(self.fc1(hidden_states)) - attention_output = self.fc2(self.dropout(hidden_states)) - hidden_states = hidden_states + self.dropout2(attention_output) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = hidden_states + residual hidden_states = self.layer_norm_after(hidden_states) + return hidden_states, attention_weights class GroundingDINOBiMultiHeadAttention(nn.Module): @@ -1423,12 +1426,10 @@ def forward( ) (text_features, text_enhanced_attn) = self.text_enhancer_layer( - hidden_states=text_features.transpose(0, 1), + hidden_states=text_features, attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=( - text_position_embedding.transpose(0, 1) if text_position_embedding is not None else None - ), - ).transpose(0, 1) + position_embeddings=(text_position_embedding if text_position_embedding is not None else None) + ) (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, From 35d6639cd226ff0e5b9f4deebce25dc9b3ade2ab Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 14:31:17 -0300 Subject: [PATCH 019/252] Fixed output bug with GroundingDINODeformableLayer --- .../models/grounding_dino/modeling_grounding_dino.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git 
a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 36822d53eaa9ab..e8e147cb00554a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1329,12 +1329,7 @@ def forward( clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights def get_sine_pos_embed( pos_tensor: torch.Tensor, From 23d9048ccca9b1c7210f9b45e78a4be3a8079b51 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 15 Sep 2023 18:57:37 -0300 Subject: [PATCH 020/252] Fixed bugs that prevent GroundingDINOForObjectDetection to run forward method --- .../configuration_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e413d43b55cd89..3a62780362d834 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -204,7 +204,7 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - two_stage_num_proposals=300, + two_stage_num_proposals=900, with_box_refine=True, class_cost=1, bbox_cost=5, diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index e8e147cb00554a..2e9d7d3d0de7f5 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1536,9 +1536,9 @@ def forward( # Cross-Attention Text hidden_states, text_cross_attn_weights = self.encoder_attn_text( query=self.with_pos_embed(hidden_states, position_embeddings), - key=text_encoder_hidden_states.transpose(0, 1), - value=text_encoder_hidden_states.transpose(0, 1), - attn_mask=text_encoder_attention_mask, + key=text_encoder_hidden_states, + value=text_encoder_hidden_states, + key_padding_mask=text_encoder_attention_mask, average_attn_weights=False ) @@ -1590,12 +1590,12 @@ def __init__(self, config): def forward( self, vision_hidden_state: torch.FloatTensor, - text_hiddend_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, text_token_mask: torch.BoolTensor ) -> torch.FloatTensor: - output = vision_hidden_state @ text_hiddend_state.transpose(-1, -2) + output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) # padding to max_text_len @@ -1867,7 +1867,7 @@ def forward( text_position_embedding=text_position_embedding, text_self_attention_masks=text_self_attention_masks, text_position_ids=text_position_ids - ) + ) if output_attentions: @@ -2488,7 +2488,7 @@ def forward( topk_coords_logits = topk_coords_logits.detach() reference_points = topk_coords_logits.sigmoid() init_reference_points = reference_points - if query_embeds: + if query_embeds is not None: target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) else: target = torch.gather( @@ -2679,6 +2679,7 @@ def forward( ) hidden_states = outputs.intermediate_hidden_states if return_dict else 
outputs[2] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[9] init_reference = outputs.init_reference_points if return_dict else outputs[0] inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] @@ -2692,7 +2693,11 @@ def forward( else: reference = inter_references[:, level - 1] reference = inverse_sigmoid(reference) - outputs_class = self.class_embed[level](hidden_states[:, level]) + outputs_class = self.class_embed[level]( + vision_hidden_state=hidden_states[:, level], + text_hidden_state=enc_text_hidden_state, + text_token_mask=text_token_mask + ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: outputs_coord_logits = delta_bbox + reference From 038a63a4e49f74f958f2fa8f6761b0422377de52 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 22:37:58 -0300 Subject: [PATCH 021/252] Fixed attentions to be passed correctly --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2e9d7d3d0de7f5..edbab3773a4fcd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2504,7 +2504,7 @@ def forward( vision_encoder_hidden_states=encoder_outputs[0], vision_encoder_attention_mask=mask_flatten, text_encoder_hidden_states=encoder_outputs[1], - text_encoder_attention_mask=text_token_mask, + text_encoder_attention_mask=~text_token_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, From e113630c5a9bcb59da14fd1793e47f56d2beb6e9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:46:17 -0300 Subject: [PATCH 022/252] Passing temperature arg when creating Sine position embedding --- .../models/grounding_dino/modeling_grounding_dino.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index edbab3773a4fcd..671092a234ee04 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -594,7 +594,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[in return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->GroundingDINO class GroundingDINOSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you @@ -619,8 +618,8 @@ def forward(self, pixel_values, pixel_mask): x_embed = pixel_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 - y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) @@ 
-662,7 +661,7 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding(n_steps, normalize=True) + position_embedding = GroundingDINOSinePositionEmbedding(n_steps, config.positional_embedding_temperature, normalize=True) elif config.position_embedding_type == "learned": position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) else: From 30af3a2953ff0a39cb0f0f38a575935cbbcc7aff Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:47:09 -0300 Subject: [PATCH 023/252] Removed copy comments --- .../models/grounding_dino/modeling_grounding_dino.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 671092a234ee04..000c3e1f23ff1f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -656,7 +656,6 @@ def forward(self, pixel_values, pixel_mask=None): return pos -# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->GroundingDINO def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": From baad9526352c3232e0f541ee707f952bd68c4071 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:48:03 -0300 Subject: [PATCH 024/252] Added temperature argument for position embedding --- .../models/grounding_dino/configuration_grounding_dino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3a62780362d834..e321782b197810 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -150,7 +150,8 @@ class GroundingDINOConfig(PretrainedConfig): Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. - + positional_embedding_temperature (`float`, *optional*, defaults to 20): + The temperature for Sine Positional Embedding that is used together with vision backbone. 
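For intuition, a minimal standalone sketch (plain PyTorch, not the classes touched by this patch) of how the temperature parameter shapes a 1-D sine/cosine position embedding; `num_feats` and the positions are arbitrary:

```python
import torch


def sine_embed_1d(positions: torch.Tensor, num_feats: int = 16, temperature: float = 20.0) -> torch.Tensor:
    """Toy 1-D temperature-scaled sine/cosine position embedding."""
    dim_t = torch.arange(num_feats, dtype=torch.float32)
    # Smaller temperatures push the later dimensions toward higher frequencies;
    # the config above defaults this to 20 for the vision backbone.
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_feats)
    pos = positions[:, None] / dim_t
    # sine on even dimensions, cosine on odd dimensions, interleaved
    return torch.stack((pos[:, 0::2].sin(), pos[:, 1::2].cos()), dim=2).flatten(1)


print(sine_embed_1d(torch.arange(4, dtype=torch.float32)).shape)  # torch.Size([4, 16])
```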
Examples: ```python @@ -227,6 +228,7 @@ def __init__( decoder_bbox_embed_share = True, two_stage_bbox_embed_share = False, two_stage_class_embed_share = False, + positional_embedding_temperature = 20, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -302,6 +304,7 @@ def __init__( if two_stage_bbox_embed_share and not decoder_bbox_embed_share: raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") self.two_stage_class_embed_share = two_stage_class_embed_share + self.positional_embedding_temperature = positional_embedding_temperature super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property From 6e37211690a8db92b4487fd356a089bb5214c6e0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:48:36 -0300 Subject: [PATCH 025/252] Fixed typo when converting weigths to GroundingDINO vision backbone --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 15793a0df03ae7..3fe62356b8e7d9 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -84,7 +84,7 @@ def create_rename_keys(state_dict, config): f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", - f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention @@ -430,6 +430,8 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): position_ids=text_inputs["position_ids"], ) + print("Finished") + # if pytorch_dump_folder_path is not None: # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") # model.save_pretrained(pytorch_dump_folder_path) From 0db05e0547ee3f0d74f7aadbf97726a722e0163d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:31:38 -0300 Subject: [PATCH 026/252] Final modifications on modeling --- .../grounding_dino/modeling_grounding_dino.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 000c3e1f23ff1f..92ccdb41bab011 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1005,9 +1005,9 @@ def forward( ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output - residual = hidden_states - hidden_states = self.layer_norm_before(hidden_states) + + residual = hidden_states hidden_states = self.activation(self.fc1(hidden_states)) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = self.fc2(hidden_states) @@ -1426,7 +1426,7 @@ def forward( (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, - 
attention_mask=key_padding_mask, + attention_mask=~key_padding_mask, position_embeddings=vision_position_embedding, reference_points=reference_points, spatial_shapes=spatial_shapes, @@ -1517,9 +1517,10 @@ def forward( residual = hidden_states # Self Attention + q = k = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=self.with_pos_embed(hidden_states, position_embeddings), - key=self.with_pos_embed(hidden_states, position_embeddings), + query=q, + key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False @@ -1826,9 +1827,6 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - #TODO check if this is necessary according to original implementation - vision_features = nn.functional.dropout(vision_features, p=self.dropout, training=self.training) - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) encoder_vision_states = () if output_hidden_states else None From a1eba2e505247e27bdcd1499218b1226252abffb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:41:35 -0300 Subject: [PATCH 027/252] Removed unnecessary class --- .../grounding_dino/modeling_grounding_dino.py | 119 ------------------ 1 file changed, 119 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 92ccdb41bab011..94090841784322 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -841,125 +841,6 @@ def forward( return output, attention_weights - -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO -class GroundingDINOMultiheadAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. - - Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). - """ - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - bias: bool = True, - ): - super().__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - if self.head_dim * num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {num_heads})." 
- ) - self.scaling = self.head_dim**-0.5 - - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - - def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): - return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): - return tensor if position_embeddings is None else tensor + position_embeddings - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_embeddings: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, target_len, embed_dim = hidden_states.size() - # add position embeddings to the hidden states before projecting to queries and keys - if position_embeddings is not None: - hidden_states_original = hidden_states - hidden_states = self.with_pos_embed(hidden_states, position_embeddings) - - # get queries, keys and values - query_states = self.q_proj(hidden_states) * self.scaling - key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) - value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) - - proj_shape = (batch_size * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - source_len = key_states.size(1) - - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): - raise ValueError( - f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" - f" {attn_weights.size()}" - ) - - # expand attention_mask - if attention_mask is not None: - # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] - attention_mask = _expand_mask(attention_mask, hidden_states.dtype) - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, target_len, source_len): - raise ValueError( - f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" - f" {attention_mask.size()}" - ) - attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask - attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) - attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(batch_size, target_len, embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped - -# Repeting some code to avoid convert nn.MultiheadAttention later #TODO is this an approriate way to name this? class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" From 9cf7c3a272200aa790809e71594e372c58ef8ec2 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:42:41 -0300 Subject: [PATCH 028/252] Fixed convert structure --- .../convert_grounding_dino_to_hf.py | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 3fe62356b8e7d9..5dcaad277092ca 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -388,7 +388,12 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return tokenized_for_encoder, tokenized.attention_mask.bool() @torch.no_grad() -def convert_grounding_dino_checkpoint(model_name, checkpoint_path): +def convert_grounding_dino_checkpoint( + model_name: str, + checkpoint_path: str, + pytorch_dump_folder_path: str = None, + push_to_hub: bool = False +): #Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) @@ -420,6 +425,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): image_inputs = image_processor(image) text_inputs, text_token_mask = text_processor(text, config) + # Running forward outputs = model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], @@ -430,19 +436,17 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): position_ids=text_inputs["position_ids"], ) - print("Finished") + if pytorch_dump_folder_path is not None: + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) - # if pytorch_dump_folder_path is not None: - # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - # model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) - # print(f"Saving image processor to {pytorch_dump_folder_path}") - # image_processor.save_pretrained(pytorch_dump_folder_path) - - # if push_to_hub: - # print(f"Pushing model and image processor for {model_name} to hub") - # 
model.push_to_hub(f"microsoft/{model_name}") - # image_processor.push_to_hub(f"microsoft/{model_name}") + if push_to_hub: + print(f"Pushing model and image processor for {model_name} to hub") + model.push_to_hub(f"microsoft/{model_name}") + image_processor.push_to_hub(f"microsoft/{model_name}") if __name__ == "__main__": @@ -469,4 +473,9 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) args = parser.parse_args() - convert_grounding_dino_checkpoint(args.model_name, args.checkpoint_path) \ No newline at end of file + convert_grounding_dino_checkpoint( + args.model_name, + args.checkpoint_path, + args.pytorch_dump_folder_path, + args.push_to_hub + ) \ No newline at end of file From 9c55b247442a99bf438927f3fa5799b225e14dd9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 24 Sep 2023 01:35:07 -0300 Subject: [PATCH 029/252] Added image processing --- .../image_processing_grounding_dino.py | 967 ++++++++++++++++++ 1 file changed, 967 insertions(+) create mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py new file mode 100644 index 00000000000000..1adf8e8e0dcd62 --- /dev/null +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -0,0 +1,967 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Image processor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import BaseImageProcessor, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + make_list_of_images, + to_numpy_array, + valid_coco_detection_annotations, + valid_images, +) +from ...utils import ( + ExplicitEnum, + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + +if is_scipy_available(): + import scipy.special + import scipy.stats + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotionFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION) + + +# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (height <= width and height == size) or (width <= height and width == size): + return height, width + + if width < height: + ow = size + oh = int(size * height / width) + else: + oh = size + ow = int(size * width / height) + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. 
+ """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> List[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. 
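For reference, the mask produced here is just a binary map with ones over the valid (unpadded) region; a tiny NumPy illustration with made-up sizes:

```python
import numpy as np

# a 2x3 image placed in a 3x4 padded canvas
mask = np.zeros((3, 4), dtype=np.int64)
mask[:2, :3] = 1
print(mask)
# [[1 1 1 0]
#  [1 1 1 0]
#  [0 0 0 0]]
```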
+ """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + +def prepare_coco_detection_annotation( + image, + target, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by GroundingDINO. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints[keep] + + return new_target + +# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities +def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + probs = scipy.special.softmax(logits, axis=-1) + labels = probs.argmax(-1, keepdims=True) + scores = np.take_along_axis(probs, labels, axis=-1) + scores, labels = scores.squeeze(-1), labels.squeeze(-1) + return scores, labels + +# Copied from transformers.models.detr.image_processing_detr.resize_annotation +def resize_annotation( + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. 
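The box and area rescaling performed below reduces to multiplying by the height/width ratios; a small NumPy example with illustrative numbers:

```python
import numpy as np

orig_size, target_size = (480, 640), (800, 1066)
ratio_height = target_size[0] / orig_size[0]    # ≈ 1.667
ratio_width = target_size[1] / orig_size[1]     # ≈ 1.666
boxes = np.array([[10.0, 20.0, 110.0, 220.0]])  # (x0, y0, x1, y1) in the original image
scaled_boxes = boxes * np.array([ratio_width, ratio_height, ratio_width, ratio_height])
scaled_area = np.array([100.0 * 200.0]) * (ratio_width * ratio_height)
print(scaled_boxes.round(1), scaled_area.round(1))
```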
+ """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +class GroundingDINOImageProcessor(BaseImageProcessor): + r""" + Constructs a Grounding DINO image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in + the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be + overridden by the `do_pad` parameter in the `preprocess` method. 
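A minimal usage sketch for this processor. It assumes the class ends up exported from `transformers` once the rest of this series lands (otherwise it can be imported from this module directly); the input is a random image and the padded shape follows from the defaults shown above:

```python
import numpy as np
from transformers import GroundingDINOImageProcessor  # assumes the top-level export added later in this series

image_processor = GroundingDINOImageProcessor()  # shortest_edge=800, longest_edge=1333, do_pad=True
image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 800, 1066])
print(inputs["pixel_mask"].shape)    # torch.Size([1, 800, 1066])
```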
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_pad: bool = True, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + + @classmethod + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + def prepare_annotation( + self, + image: np.ndarray, + target: Dict, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into Grounding DINO model. + """ + target = prepare_coco_detection_annotation( + image, target, input_data_format=input_data_format + ) + + return target + + def prepare(self, image, target): + logger.warning_once( + "The `prepare` method is deprecated and will be removed in a v4.33. " + "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " + "does not return the image anymore.", + ) + target = self.prepare_annotation(image, target) + return image, target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary containing the size to resize to. 
Can contain the keys `shortest_edge` and `longest_edge` or + `height` and `width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> Dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
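Rescaling itself is a plain multiplication by `rescale_factor`; the box normalization applied by `normalize_annotation` just below is similarly mechanical, converting corner coordinates to center format and dividing by the image size. A worked NumPy example with made-up numbers:

```python
import numpy as np

image_height, image_width = 800, 1066
box = np.array([100.0, 200.0, 300.0, 600.0])  # (x0, y0, x1, y1) in pixels
center_format = np.array([
    (box[0] + box[2]) / 2,  # center x
    (box[1] + box[3]) / 2,  # center y
    box[2] - box[0],        # width
    box[3] - box[1],        # height
])
normalized = center_format / np.array([image_width, image_height, image_width, image_height])
print(normalized.round(3))  # approximately [0.188, 0.5, 0.188, 0.5]
```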
+ """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format. + """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: Tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + return padded_image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: List[np.ndarray], + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + image (`np.ndarray`): + Image to pad. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
+ """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + padded_images = [ + self._pad_image( + image, + pad_size, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image after resizing. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. 
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + max_size = None + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, max_size=max_size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_pad = self.do_pad if do_pad is None else do_pad + + if do_resize is not None and size is None: + raise ValueError("Size and max_size must be specified if do_resize is True.") + + if do_rescale is not None and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize is not None and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + images = make_list_of_images(images) + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if not valid_coco_detection_annotations(annotations): + raise ValueError( + "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts" + "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " + "being a list of annotations in the COCO format." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + if annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + data = self.pad( + images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + data = {"pixel_values": images} + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # POSTPROCESSING METHODS - TODO: add support for other frameworks + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`GroundingDINOForObjectDetection`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
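The top-k selection in the body below works on scores flattened across queries and labels, then recovers the query index with an integer division and the label with a modulo; a self-contained sketch of that indexing with random logits and illustrative sizes:

```python
import torch

num_queries, num_labels, k = 900, 256, 5
logits = torch.randn(1, num_queries, num_labels)
prob = logits.sigmoid().view(1, -1)              # (1, num_queries * num_labels)
topk_values, topk_indexes = torch.topk(prob, k, dim=1)
query_indices = torch.div(topk_indexes, num_labels, rounding_mode="floor")  # which box/query
label_indices = topk_indexes % num_labels                                   # which label
print(query_indices.shape, label_indices.shape)  # torch.Size([1, 5]) torch.Size([1, 5])
```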
+ """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100 + ): + """ + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`GroundingDINOForObjectDetection`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + top_k (`int`, *optional*, defaults to 100): + Keep only top k bounding boxes before filtering by thresholding. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
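A self-contained usage sketch for this post-processing step. `DummyOutputs` stands in for the real detection output of the model elsewhere in this series; the shapes, threshold, and target size are arbitrary:

```python
from dataclasses import dataclass

import torch

from transformers import GroundingDINOImageProcessor  # assumes the top-level export added later in this series


@dataclass
class DummyOutputs:
    logits: torch.Tensor      # (batch, num_queries, num_labels), pre-sigmoid
    pred_boxes: torch.Tensor  # (batch, num_queries, 4), normalized (cx, cy, w, h)


processor = GroundingDINOImageProcessor()
outputs = DummyOutputs(logits=torch.randn(1, 900, 256), pred_boxes=torch.rand(1, 900, 4))
results = processor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=torch.tensor([[480, 640]])
)[0]
print(results["scores"].shape, results["labels"].shape, results["boxes"].shape)
```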
+ """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = out_logits.sigmoid() + prob = prob.view(out_logits.shape[0], -1) + k_value = min(top_k, prob.size(1)) + topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results From ae570bbdf31b249ee8d16fb4742864ec82f6aff3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 24 Sep 2023 01:37:59 -0300 Subject: [PATCH 030/252] make fixup partially completed --- docs/source/en/tasks/object_detection.md | 2 +- src/transformers/__init__.py | 32 +- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 6 +- .../models/auto/feature_extraction_auto.py | 1 - .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 +- .../configuration_grounding_dino.py | 35 +- .../convert_grounding_dino_to_hf.py | 163 +++---- .../grounding_dino/modeling_grounding_dino.py | 405 +++++++++--------- .../processing_grounding_dino.py | 0 .../tokenization_grounding_dino.py | 0 src/transformers/utils/dummy_pt_objects.py | 48 +-- utils/check_repo.py | 1 + 14 files changed, 347 insertions(+), 354 deletions(-) create mode 100644 src/transformers/models/grounding_dino/processing_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/tokenization_grounding_dino.py diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 4eab9e58fb27da..38498417c6fb77 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [Grounding DINO](../model_doc/grounding-dino), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Grounding DINO](../model_doc/grounding-dino), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aa2f7837b4ce67..4ea2c3ace121ea 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -274,7 +274,6 @@ "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", 
"DebertaV2Config"], "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"], "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], "models.deprecated": [], "models.deprecated.bort": [], @@ -358,6 +357,7 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -1542,14 +1542,6 @@ "DeformableDetrPreTrainedModel", ] ) - _import_structure["models.grounding_dino"].extend( - [ - "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", - "GroundingDINOForObjectDetection", - "GroundingDINOModel", - "GroundingDINOPreTrainedModel", - ] - ) _import_structure["models.deit"].extend( [ "DEIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1921,6 +1913,14 @@ "GraphormerPreTrainedModel", ] ) + _import_structure["models.grounding_dino"].extend( + [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + ) _import_structure["models.groupvit"].extend( [ "GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4338,7 +4338,6 @@ DecisionTransformerConfig, ) from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.deprecated.mctct import ( MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -4414,6 +4413,7 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, @@ -5445,12 +5445,6 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) - from .models.grounding_dino import ( - GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, - GroundingDINOForObjectDetection, - GroundingDINOModel, - GroundingDINOPreTrainedModel, - ) from .models.deit import ( DEIT_PRETRAINED_MODEL_ARCHIVE_LIST, DeiTForImageClassification, @@ -5753,6 +5747,12 @@ GraphormerModel, GraphormerPreTrainedModel, ) + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) from .models.groupvit import ( GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST, GroupViTModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 376f9353608e56..32e022f6d1d873 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -59,7 +59,6 @@ deberta_v2, decision_transformer, deformable_detr, - grounding_dino, deit, deprecated, deta, @@ -98,6 +97,7 @@ gptj, gptsan_japanese, graphormer, + grounding_dino, groupvit, herbert, hubert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index db5e5f86761b88..c60f2bd5aa0256 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ 
b/src/transformers/models/auto/configuration_auto.py @@ -71,7 +71,6 @@ ("deberta-v2", "DebertaV2Config"), ("decision_transformer", "DecisionTransformerConfig"), ("deformable_detr", "DeformableDetrConfig"), - ("grounding-dino", "GroundingDINOConfig"), ("deit", "DeiTConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), @@ -107,6 +106,7 @@ ("gptj", "GPTJConfig"), ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), + ("grounding-dino", "GroundingDINOConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), @@ -278,7 +278,6 @@ ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deta", "DETA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -312,6 +311,7 @@ ("gptj", "GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gptsan-japanese", "GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -476,7 +476,6 @@ ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), ("deformable_detr", "Deformable DETR"), - ("grounding-dino", "Grounding DINO"), ("deit", "DeiT"), ("deplot", "DePlot"), ("deta", "DETA"), @@ -517,6 +516,7 @@ ("gptj", "GPT-J"), ("gptsan-japanese", "GPTSAN-japanese"), ("graphormer", "Graphormer"), + ("grounding-dino", "Grounding DINO"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), ("hubert", "Hubert"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 78a0686c4816b0..90ece37c657191 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -50,7 +50,6 @@ ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("data2vec-vision", "BeitFeatureExtractor"), ("deformable_detr", "DeformableDetrFeatureExtractor"), - ("grounding-dino", "GroundingDINOFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("dinat", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index ec8bf20938fd7a..d6d722b3e0842b 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -53,7 +53,6 @@ ("cvt", "ConvNextImageProcessor"), ("data2vec-vision", "BeitImageProcessor"), ("deformable_detr", "DeformableDetrImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), ("deit", "DeiTImageProcessor"), ("deta", "DetaImageProcessor"), ("detr", "DetrImageProcessor"), @@ -67,6 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2c54349e8306b2..abfa4f0e50328c 100755 --- 
a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -69,7 +69,6 @@ ("deberta-v2", "DebertaV2Model"), ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), - ("grounding-dino", "GroundingDINOModel"), ("deit", "DeiTModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), @@ -104,6 +103,7 @@ ("gptj", "GPTJModel"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), + ("grounding-dino", "GroundingDINOModel"), ("groupvit", "GroupViTModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), @@ -620,9 +620,9 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), - ("grounding-dino", "GroundingDINOForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), + ("grounding-dino", "GroundingDINOForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e321782b197810..09b9c41f131964 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -26,11 +26,10 @@ } - class GroundingDINOConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate - a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a + This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a + Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. @@ -147,9 +146,11 @@ class GroundingDINOConfig(PretrainedConfig): decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): Whether to share the bbox embedding between the decoder and the two-stage bbox generator. two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal + generation. two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. + Whether to share the class embedding between the two-stage bbox generator and the region proposal + generation. positional_embedding_temperature (`float`, *optional*, defaults to 20): The temperature for Sine Positional Embedding that is used together with vision backbone. 
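A minimal sketch of how the new text/fusion arguments documented above might be passed when instantiating the configuration; the parameter names and default values are copied from the `__init__` signature reformatted in this hunk, and the top-level import assumes the `GroundingDINOConfig` export added earlier in this series:

from transformers import GroundingDINOConfig

# Hypothetical instantiation; every value below is simply the default from the signature
# in this patch and can be overridden as needed.
config = GroundingDINOConfig(
    max_text_len=256,
    sub_sentence_present=True,
    text_enhancer_dropout=0.0,
    fusion_droppath=0.1,
    fusion_dropout=0.0,
    embedding_init_target=True,
    query_dim=4,
    decoder_bbox_embed_share=True,
    two_stage_bbox_embed_share=False,
    two_stage_class_embed_share=False,
    positional_embedding_temperature=20,
)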
Examples: @@ -217,18 +218,18 @@ def __init__( eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, - #other parameters - max_text_len = 256, - sub_sentence_present = True, - text_enhancer_dropout = 0.0, - fusion_droppath = 0.1, - fusion_dropout = 0.0, - embedding_init_target = True, - query_dim = 4, - decoder_bbox_embed_share = True, - two_stage_bbox_embed_share = False, - two_stage_class_embed_share = False, - positional_embedding_temperature = 20, + # other parameters + max_text_len=256, + sub_sentence_present=True, + text_enhancer_dropout=0.0, + fusion_droppath=0.1, + fusion_dropout=0.0, + embedding_init_target=True, + query_dim=4, + decoder_bbox_embed_share=True, + two_stage_bbox_embed_share=False, + two_stage_class_embed_share=False, + positional_embedding_temperature=20, **kwargs, ): if backbone_config is not None and use_timm_backbone: diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 5dcaad277092ca..4f2f3716329ed4 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -14,7 +14,8 @@ # limitations under the License. """Convert GroundingDINO SimMIM checkpoints from the original repository. -URL: https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" +URL: +https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" import argparse @@ -22,11 +23,9 @@ import torch from PIL import Image from torchvision import transforms as T -import torchvision.transforms.functional as F -from transformers import ( - GroundingDINOConfig, GroundingDINOForObjectDetection, AutoTokenizer -) +from transformers import AutoTokenizer, GroundingDINOConfig, GroundingDINOForObjectDetection + IMAGENET_MEAN = [0.485, 0.456, 0.406] IMAGENET_STD = [0.229, 0.224, 0.225] @@ -66,64 +65,64 @@ def create_rename_keys(state_dict, config): #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer - rename_keys.append(("backbone.0.patch_embed.proj.weight", + rename_keys.append(("backbone.0.patch_embed.proj.weight", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.patch_embed.proj.bias", + rename_keys.append(("backbone.0.patch_embed.proj.bias", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.patch_embed.norm.weight", + rename_keys.append(("backbone.0.patch_embed.norm.weight", "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.patch_embed.norm.bias", + rename_keys.append(("backbone.0.patch_embed.norm.bias", "model.backbone.conv_encoder.model.embeddings.norm.bias")) for layer, depth in enumerate(config.backbone_config.depths): for block in range(depth): # layernorms - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) # intermidiate - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) - + # output - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) - + # downsample if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) - + for out_indice in config.backbone_config.out_indices: # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", + rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", + rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) - + ########################################## VISION BACKBONE - END ########################################## ENCODER - START @@ -182,15 +181,15 @@ def create_rename_keys(state_dict, config): for layer in range(config.encoder_layers): # deformable for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # text enhance for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # fusion layers for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) ########################################## ENCODER - END @@ -230,7 +229,7 @@ def create_rename_keys(state_dict, config): target_prefix_decoder = f'model.decoder.layers.{layer_num}.' 
for source_name, target_name in key_mappings_decoder.items(): - rename_keys.append((source_prefix_decoder + source_name, + rename_keys.append((source_prefix_decoder + source_name, target_prefix_decoder + target_name)) ########################################## DECODER - END @@ -240,7 +239,7 @@ def create_rename_keys(state_dict, config): ########################################## Additional - START for layer_name, params in state_dict.items(): - #### TEXT BACKBONE + #### TEXT BACKBONE if "bert" in layer_name: rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE @@ -251,19 +250,19 @@ def create_rename_keys(state_dict, config): rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) #### DECODER REFERENCE POINT HEAD if "transformer.decoder.ref_point_head" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", "model.decoder.reference_points_head"))) #### DECODER BBOX EMBED if "transformer.decoder.bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", "model.decoder.bbox_embed"))) if "transformer.enc_output" in layer_name: rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) - + if "transformer.enc_out_bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", + rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", "model.encoder_output_bbox_embed"))) - + rename_keys.append(("transformer.level_embed", "model.level_embed")) rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) @@ -273,10 +272,12 @@ def create_rename_keys(state_dict, config): # fmt: on return rename_keys + def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val + # we split up the matrix of each encoder layer into queries, keys and values def read_in_q_k_v(state_dict, config): ########################################## VISION BACKBONE - START @@ -288,14 +289,26 @@ def read_in_q_k_v(state_dict, config): in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] - - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight"] = in_proj_weight[-hidden_size :, :] - 
state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias"] = in_proj_bias[-hidden_size :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight" + ] = in_proj_weight[:hidden_size, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias" + ] = in_proj_bias[:hidden_size] + + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight" + ] = in_proj_weight[hidden_size : hidden_size * 2, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias" + ] = in_proj_bias[hidden_size : hidden_size * 2] + + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight" + ] = in_proj_weight[-hidden_size:, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias" + ] = in_proj_bias[-hidden_size:] ########################################## VISION BACKBONE - END @@ -305,12 +318,14 @@ def prepare_img(): image = Image.open(requests.get(url, stream=True).raw).convert("RGB") return image + def text_processor(text: str, config): def preprocess_caption(caption: str) -> str: result = caption.lower().strip() if result.endswith("."): return result return result + "." + def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: """Generate attention mask between each pair of special tokens Args: @@ -330,9 +345,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token idxs = torch.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = ( - torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - ) + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) position_ids = torch.zeros((bs, num_token), device=input_ids.device) cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 @@ -352,8 +365,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token previous_col = col cate_to_token_mask_list = [ - torch.stack(cate_to_token_mask_listi, dim=0) - for cate_to_token_mask_listi in cate_to_token_mask_list + torch.stack(cate_to_token_mask_listi, dim=0) for cate_to_token_mask_listi in cate_to_token_mask_list ] # # padding mask @@ -361,23 +373,23 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() return attention_mask, position_ids.to(torch.long) + tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens) - + tokenized, special_tokens + ) + max_text_len = config.max_text_len sub_sentence_present = config.sub_sentence_present if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[ - :, : max_text_len, : max_text_len - ] - position_ids = position_ids[:, : max_text_len] - tokenized["input_ids"] = 
tokenized["input_ids"][:, : max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, : max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : max_text_len] + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] # extract text embeddings if sub_sentence_present: @@ -387,14 +399,12 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return tokenized_for_encoder, tokenized.attention_mask.bool() + @torch.no_grad() def convert_grounding_dino_checkpoint( - model_name: str, - checkpoint_path: str, - pytorch_dump_folder_path: str = None, - push_to_hub: bool = False + model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str = None, push_to_hub: bool = False ): - #Define default GroundingDINO configuation + # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) # Load original checkpoint @@ -403,7 +413,7 @@ def convert_grounding_dino_checkpoint( # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) - + for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) @@ -416,17 +426,13 @@ def convert_grounding_dino_checkpoint( image = prepare_img() text = "a cat" image_processor = T.Compose( - [ - T.Resize(size=800, max_size=1333), - T.ToTensor(), - T.Normalize(IMAGENET_MEAN, IMAGENET_STD) - ] + [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] ) image_inputs = image_processor(image) text_inputs, text_token_mask = text_processor(text, config) # Running forward - outputs = model( + model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"], @@ -474,8 +480,5 @@ def convert_grounding_dino_checkpoint( args = parser.parse_args() convert_grounding_dino_checkpoint( - args.model_name, - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.push_to_hub - ) \ No newline at end of file + args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub + ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 94090841784322..69264d51b5e6b0 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -40,13 +40,11 @@ requires_backends, ) from ...modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPoolingAndCrossAttentions, - BaseModelOutputWithPastAndCrossAttentions + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...pytorch_utils import meshgrid +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone from .configuration_grounding_dino import GroundingDINOConfig @@ -135,7 +133,6 @@ def 
backward(context, grad_output): ] - @dataclass class GroundingDINODecoderOutput(ModelOutput): """ @@ -177,11 +174,11 @@ class GroundingDINODecoderOutput(ModelOutput): vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + @dataclass class GroundingDINOEncoderOutput(ModelOutput): """ - Base class for outputs of the GroundingDINOEncoder. This class extends - BaseModelOutput, due to: + Base class for outputs of the GroundingDINOEncoder. This class extends BaseModelOutput, due to: - vision and text last hidden states - vision and text intermediate hidden states - vision and text attentions @@ -193,30 +190,31 @@ class GroundingDINOEncoderOutput(ModelOutput): last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the text encoder. hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer - plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer - plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. 
+ Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. """ + last_hidden_state_vision: torch.FloatTensor = None last_hidden_state_text: torch.FloatTensor = None hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None @@ -262,29 +260,29 @@ class GroundingDINOModelOutput(ModelOutput): encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. 
@@ -359,29 +357,29 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. 
Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -618,8 +616,8 @@ def forward(self, pixel_values, pixel_mask): x_embed = pixel_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) @@ -660,7 +658,9 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding(n_steps, config.positional_embedding_temperature, normalize=True) + position_embedding = GroundingDINOSinePositionEmbedding( + n_steps, config.positional_embedding_temperature, normalize=True + ) elif config.position_embedding_type == "learned": position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) else: @@ -841,17 +841,19 @@ def forward( return output, attention_weights -#TODO is this an approriate way to name this? + +# TODO is this an approriate way to name this? 
class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" + def __init__(self, config): super().__init__() self.self_attn = nn.MultiheadAttention( - embed_dim=config.d_model, - num_heads=config.encoder_attention_heads // 2, + embed_dim=config.d_model, + num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout, batch_first=True, - ) + ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) @@ -871,18 +873,14 @@ def forward( hidden_states: Tensor, attention_masks: Optional[Tensor] = None, position_embeddings: Optional[Tensor] = None, - ): # repeat attn mask + ): # repeat attn mask if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) attention_output, attention_weights = self.self_attn( - query=q, - key=k, - value=hidden_states, - attn_mask=attention_masks, - average_attn_weights=False + query=q, key=k, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output @@ -897,16 +895,10 @@ def forward( hidden_states = self.layer_norm_after(hidden_states) return hidden_states, attention_weights - + + class GroundingDINOBiMultiHeadAttention(nn.Module): - def __init__( - self, - vision_dim: int, - text_dim: int, - embed_dim: int, - num_heads: int, - dropout:float = 0.1 - ): + def __init__(self, vision_dim: int, text_dim: int, embed_dim: int, num_heads: int, dropout: float = 0.1): super().__init__() self.embed_dim = embed_dim @@ -949,12 +941,12 @@ def _reset_parameters(self): self.out_text_proj.bias.data.fill_(0) def forward( - self, - vision_features: Tensor, - text_features: Tensor, - vision_attention_mask: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None - ): + self, + vision_features: Tensor, + text_features: Tensor, + vision_attention_mask: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + ): """_summary_ Args: @@ -1000,21 +992,21 @@ def forward( attn_weights = attn_weights - attn_weights.max() attn_weights = torch.clamp( - attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range attn_weights = torch.clamp( - attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range attn_weights_T = attn_weights.transpose(1, 2) text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] - + text_attn_weights = torch.clamp( - text_attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + text_attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range text_attn_weights = torch.clamp( - text_attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + text_attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range # mask vison for language if vision_attention_mask is not None: @@ -1027,9 +1019,7 @@ def forward( # mask language 
for vision if text_attention_mask is not None: - text_attention_mask = ( - text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) - ) + text_attention_mask = text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) attn_weights.masked_fill_(text_attention_mask, float("-inf")) vision_attn_weights = attn_weights.softmax(dim=-1) @@ -1062,6 +1052,7 @@ def forward( return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) + # Copied from transformers.models.beit.modeling_beit.drop_path def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: """ @@ -1082,6 +1073,7 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals output = input.div(keep_prob) * random_tensor return output + # Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO class GroundingDINODropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" @@ -1095,6 +1087,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) + + class GroundingDINOFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() @@ -1104,11 +1098,11 @@ def __init__(self, config, init_values=1e-4): self.layer_norm_vision = nn.LayerNorm(config.d_model) self.layer_norm_text = nn.LayerNorm(config.d_model) self.attn = GroundingDINOBiMultiHeadAttention( - vision_dim=config.d_model, - text_dim=config.d_model, - embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.encoder_attention_heads // 2, - dropout=config.fusion_dropout + vision_dim=config.d_model, + text_dim=config.d_model, + embed_dim=config.encoder_ffn_dim // 2, + num_heads=config.encoder_attention_heads // 2, + dropout=config.fusion_dropout, ) # add layer scale for training stability @@ -1120,17 +1114,18 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) (delta_v, vision_attn), (delta_t, text_attn) = self.attn( - vision_features, - text_features, - vision_attention_mask=attention_mask_vision, - text_attention_mask=attention_mask_text + vision_features, + text_features, + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text, ) vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) text_features = text_features + self.drop_path(self.gamma_l * delta_t) return (vision_features, vision_attn), (text_features, text_attn) -#NOTE just renamed the class + +# NOTE just renamed the class class GroundingDINODeformableLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() @@ -1210,12 +1205,13 @@ def forward( return hidden_states, attn_weights + def get_sine_pos_embed( pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000, exchange_xy: bool = True, - ) -> Tensor: +) -> Tensor: """generate sine position embedding from a position tensor Args: pos_tensor (torch.Tensor): shape: [..., n]. 
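A self-contained, simplified sketch of the sine position embedding computed by `get_sine_pos_embed` above, written under the same argument names (the `exchange_xy` option is ignored); this is an illustrative reimplementation and an assumption about the behavior, not the code from this patch:

import math

import torch


def sine_pos_embed_sketch(pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000) -> torch.Tensor:
    # Expand each scalar position in the trailing dimension into `num_pos_feats` channels
    # with geometrically increasing wavelengths controlled by `temperature`.
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
    pos = pos_tensor[..., None] * (2 * math.pi) / dim_t  # shape: [..., n, num_pos_feats]
    # Interleave sine on even channels and cosine on odd channels.
    return torch.stack((pos[..., 0::2].sin(), pos[..., 1::2].cos()), dim=-1).flatten(-2)


# E.g. position ids of shape (batch_size, seq_len) map to embeddings of shape (batch_size, seq_len, 128).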
@@ -1250,26 +1246,19 @@ def __init__(self, config) -> None: self.deformable_layer = GroundingDINODeformableLayer(config) def get_text_position_embeddings( - self, - text_features: Tensor, - text_position_embedding: Tensor, - text_position_ids: Tensor - ) -> Tensor: + self, text_features: Tensor, text_position_embedding: Tensor, text_position_ids: Tensor + ) -> Tensor: bs, n_text, text_dim = text_features.shape if text_position_embedding is None and text_position_ids is None: text_position_embedding = ( - torch.arange(n_text, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(bs, 1, 1) + torch.arange(n_text, device=text_features.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs, 1, 1) ) text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) if text_position_ids is not None: text_position_embedding = get_sine_pos_embed( text_position_ids[..., None], num_pos_feats=256, exchange_xy=False ) - + return text_position_embedding def forward( @@ -1284,12 +1273,10 @@ def forward( text_attention_mask: Optional[Tensor] = None, text_position_embedding: Optional[Tensor] = None, text_self_attention_masks: Optional[Tensor] = None, - text_position_ids: Optional[Tensor] = None + text_position_ids: Optional[Tensor] = None, ): text_position_embedding = self.get_text_position_embeddings( - text_features, - text_position_embedding, - text_position_ids + text_features, text_position_embedding, text_position_ids ) (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( @@ -1302,7 +1289,7 @@ def forward( (text_features, text_enhanced_attn) = self.text_enhancer_layer( hidden_states=text_features, attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=(text_position_embedding if text_position_embedding is not None else None) + position_embeddings=(text_position_embedding if text_position_embedding is not None else None), ) (vision_features, vision_deformable_attn) = self.deformable_layer( @@ -1315,8 +1302,8 @@ def forward( ) return ( - (vision_features, text_features), - (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn) + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn), ) @@ -1330,7 +1317,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True + batch_first=True, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -1342,7 +1329,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True + batch_first=True, ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -1400,11 +1387,7 @@ def forward( # Self Attention q = k = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=q, - key=k, - value=hidden_states, - attn_mask=self_attn_mask, - average_attn_weights=False + query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1419,7 +1402,7 @@ def forward( key=text_encoder_hidden_states, value=text_encoder_hidden_states, key_padding_mask=text_encoder_attention_mask, - 
average_attn_weights=False + average_attn_weights=False, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1462,19 +1445,18 @@ def forward( return outputs + class GroundingDINOContrastiveEmbedding(nn.Module): def __init__(self, config): super().__init__() self.max_text_len = config.max_text_len def forward( - self, - vision_hidden_state: torch.FloatTensor, - text_hidden_state: torch.FloatTensor, - text_token_mask: torch.BoolTensor - ) -> torch.FloatTensor: - - + self, + vision_hidden_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor, + ) -> torch.FloatTensor: output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) @@ -1484,6 +1466,7 @@ def forward( return new_output + # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead class GroundingDINOClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" @@ -1503,30 +1486,29 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->GroundingDINO class GroundingDINOPreTrainedModel(PreTrainedModel): config_class = GroundingDINOConfig base_model_prefix = "model" main_input_name = "pixel_values" def _init_weights(self, module): - std = self.config.init_std - if isinstance(module, GroundingDINOLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): module._reset_parameters() - elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() + elif isinstance(module, GroundingDINOBiMultiHeadAttention): + module._reset_parameters() + elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + elif isinstance(module, GroundingDINOModel): + nn.init.constant_(module.input_proj_text.bias.data, 0) + nn.init.xavier_uniform_(module.input_proj_text.weight.data) + for proj in module.input_proj_vision: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) if hasattr(module, "reference_points") and not self.config.two_stage: nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) nn.init.constant_(module.reference_points.bias.data, 0.0) @@ -1743,9 +1725,8 @@ def forward( text_attention_mask=text_attention_mask, text_position_embedding=text_position_embedding, text_self_attention_masks=text_self_attention_masks, - text_position_ids=text_position_ids - ) - + text_position_ids=text_position_ids, + ) if output_attentions: all_attn_fused_vision += (attentions[0],) @@ -1759,9 +1740,12 @@ def forward( if not return_dict: enc_outputs = [ - vision_features, text_features, - all_attn_fused_vision, all_attn_fused_text, - all_attn_enhanced_text, all_attn_deformable + vision_features, + text_features, + 
all_attn_fused_vision, + all_attn_fused_text, + all_attn_enhanced_text, + all_attn_deformable, ] return tuple(v for v in enc_outputs if v is not None) return GroundingDINOEncoderOutput( @@ -1772,9 +1756,10 @@ def forward( cross_attentions_vision=all_attn_fused_vision, cross_attentions_text=all_attn_fused_text, attentions_vision=all_attn_deformable, - attentions_text=all_attn_enhanced_text + attentions_text=all_attn_enhanced_text, ) + class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. @@ -1797,10 +1782,7 @@ def __init__(self, config: GroundingDINOConfig): self.layer_norm = nn.LayerNorm(config.d_model) self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.reference_points_head = GroundingDINOMLPPredictionHead( - config.query_dim // 2 * config.d_model, - config.d_model, - config.d_model, - 2 + config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2 ) self.gradient_checkpointing = False @@ -1826,7 +1808,7 @@ def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTen # batch_size, num_queries, num_pos_feats pos_x = pos_x[:, :, None] / dim_t pos_y = pos_y[:, :, None] / dim_t - # batch_size, num_queries, num_pos_feats + # batch_size, num_queries, num_pos_feats pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) @@ -1849,8 +1831,6 @@ def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTen raise ValueError("Unknown proposals shape(-1):{}".format(proposals.size(-1))) return pos - - def forward( self, inputs_embeds, @@ -1959,7 +1939,7 @@ def custom_forward(*inputs): text_encoder_hidden_states=text_encoder_hidden_states, text_encoder_attention_mask=text_encoder_attention_mask, self_attn_mask=self_attn_mask, - output_attentions=output_attentions + output_attentions=output_attentions, ) hidden_states = layer_outputs[0] @@ -1992,7 +1972,6 @@ def custom_forward(*inputs): if vision_encoder_hidden_states is not None: all_cross_attns_vision += (layer_outputs[3],) - # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) @@ -2012,7 +1991,7 @@ def custom_forward(*inputs): all_hidden_states, all_self_attns, all_cross_attns_vision, - all_cross_attns_text + all_cross_attns_text, ] if v is not None ) @@ -2023,7 +2002,7 @@ def custom_forward(*inputs): hidden_states=all_hidden_states, attentions=all_self_attns, vision_cross_attentions=all_cross_attns_vision, - text_cross_attentions=all_cross_attns_text + text_cross_attentions=all_cross_attns_text, ) @@ -2075,7 +2054,7 @@ def __init__(self, config: GroundingDINOConfig): ) # Create text backbone - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) + self.text_backbone = GroundingDINOTextPrenet(config.text_backbone_config) self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: @@ -2199,7 +2178,7 @@ def forward( text_token_mask: Tensor, text_self_attention_masks: Tensor, position_ids: Tensor, - pixel_mask: Optional[Tensor]=None, + pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, output_hidden_states=None, @@ -2236,7 +2215,9 @@ def forward( return_dict 
= return_dict if return_dict is not None else self.config.use_return_dict # Extract text features from text backbone - text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)["last_hidden_state"] + text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)[ + "last_hidden_state" + ] text_features = self.input_proj_text(text_features) batch_size, num_channels, height, width = pixel_values.shape @@ -2319,7 +2300,7 @@ def forward( text_position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict + return_dict=return_dict, ) # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): @@ -2346,9 +2327,7 @@ def forward( # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) enc_outputs_class = self.encoder_output_class_embed( - object_query_embedding, - encoder_outputs[1], - text_token_mask + object_query_embedding, encoder_outputs[1], text_token_mask ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) @@ -2389,7 +2368,7 @@ def forward( self_attn_mask=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict + return_dict=return_dict, ) if not return_dict: @@ -2422,8 +2401,8 @@ def forward( @add_start_docstrings( """ - Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on - top, for tasks such as COCO detection. + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, + for tasks such as COCO detection. 
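Aside (illustrative, not part of the patch): the two-stage path above scores each encoder proposal against the text tokens via `encoder_output_class_embed` and regresses a box delta via `encoder_output_bbox_embed`; the decoder is then seeded with the best-scoring proposals. A minimal numeric sketch of that selection step with toy shapes follows; the exact top-k calls are not shown in this hunk, so treat the pattern as the usual Deformable-DETR-style selection rather than a verbatim excerpt.

```python
# Toy-shape sketch of two-stage query selection; assumed pattern, not copied from the patch.
import torch

batch_size, num_proposals, num_text_tokens, num_queries = 2, 100, 16, 9
enc_outputs_class = torch.randn(batch_size, num_proposals, num_text_tokens)  # text-token logits per proposal
enc_outputs_coord_logits = torch.randn(batch_size, num_proposals, 4)         # unnormalized box logits

# Rank proposals by their best text-token logit and keep the top `num_queries`.
topk_indices = torch.topk(enc_outputs_class.max(-1).values, num_queries, dim=1).indices
topk_coords_logits = torch.gather(
    enc_outputs_coord_logits, 1, topk_indices.unsqueeze(-1).repeat(1, 1, 4)
)
reference_points = topk_coords_logits.sigmoid()  # normalized boxes that seed the decoder queries
print(reference_points.shape)  # torch.Size([2, 9, 4])
```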
""", GROUNDING_DINO_START_DOCSTRING, ) @@ -2446,13 +2425,12 @@ def __init__(self, config: GroundingDINOConfig): nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - if config.decoder_bbox_embed_share: self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) - # hack implementation for two-stage + # hack implementation for two-stage self.model.decoder.bbox_embed = self.bbox_embed self.model.decoder.class_embed = self.class_embed @@ -2461,8 +2439,8 @@ def __init__(self, config: GroundingDINOConfig): self.model.encoder_output_bbox_embed = _bbox_embed else: self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) - - #TODO don't believe this is necessary since class_embed has no parameters + + # TODO don't believe this is necessary since class_embed has no parameters if config.two_stage_class_embed_share: self.model.encoder_output_class_embed = _class_embed else: @@ -2490,12 +2468,12 @@ def forward( text_token_mask: torch.BoolTensor, text_self_attention_masks: torch.BoolTensor, position_ids: torch.LongTensor, - pixel_mask: Optional[torch.BoolTensor]=None, - encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]]=None, - labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, + pixel_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ): r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2541,14 +2519,14 @@ def forward( # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( - pixel_values=pixel_values , - input_ids=input_ids , - attention_mask=attention_mask , - token_type_ids=token_type_ids , - text_token_mask=text_token_mask , - text_self_attention_masks=text_self_attention_masks , - position_ids=position_ids , - pixel_mask=pixel_mask , + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + position_ids=position_ids, + pixel_mask=pixel_mask, encoder_outputs=encoder_outputs, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -2573,8 +2551,8 @@ def forward( outputs_class = self.class_embed[level]( vision_hidden_state=hidden_states[:, level], text_hidden_state=enc_text_hidden_state, - text_token_mask=text_token_mask - ) + text_token_mask=text_token_mask, + ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: outputs_coord_logits = delta_bbox + reference @@ -3117,6 +3095,7 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): raise ValueError("Only 3-dimensional tensors are supported") return NestedTensor(tensor, mask) + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText class GroundingDINOTextEmbeddings(nn.Module): """Construct the embeddings from word, position and 
token_type embeddings.""" @@ -3181,8 +3160,10 @@ def forward( embeddings = self.dropout(embeddings) return embeddings + # Classes for Text Backbone (It's just a BERT model) + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3317,6 +3298,7 @@ def forward( outputs = outputs + (past_key_value,) return outputs + # Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText class GroundingDINOTextSelfOutput(nn.Module): def __init__(self, config): @@ -3331,6 +3313,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText class GroundingDINOTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3380,6 +3363,7 @@ def forward( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + # Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText class GroundingDINOTextIntermediate(nn.Module): def __init__(self, config): @@ -3395,6 +3379,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText class GroundingDINOTextOutput(nn.Module): def __init__(self, config): @@ -3409,6 +3394,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText class GroundingDINOTextLayer(nn.Module): def __init__(self, config): @@ -3495,6 +3481,7 @@ def feed_forward_chunk(self, attention_output): layer_output = self.output(intermediate_output, attention_output) return layer_output + # Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText class GroundingDINOTextEncoder(nn.Module): def __init__(self, config): @@ -3593,6 +3580,7 @@ def custom_forward(*inputs): cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText class GroundingDINOTextPooler(nn.Module): def __init__(self, config): @@ -3608,7 +3596,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: pooled_output = self.activation(pooled_output) return pooled_output -class GroundingDINOTextModel(PreTrainedModel): + +class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/transformers/models/grounding_dino/tokenization_grounding_dino.py b/src/transformers/models/grounding_dino/tokenization_grounding_dino.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index f0bc1e774383b5..a36b75ce60a657 100644 --- 
a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2340,30 +2340,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None - - -class GroundingDINOForObjectDetection(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class GroundingDINOModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class GroundingDINOPreTrainedModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -3852,6 +3828,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GroundingDINOForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/utils/check_repo.py b/utils/check_repo.py index 98f2436ae3af45..8600226c8205eb 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -220,6 +220,7 @@ "FlavaMultimodalModel", "GPT2DoubleHeadsModel", "GPTSw3DoubleHeadsModel", + "GroundingDINOTextPrenet", "InstructBlipVisionModel", "InstructBlipQFormerModel", "LayoutLMForQuestionAnswering", From 1f6475f7c002ec44f6abc74d0abd08b150ecbf71 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 13:45:13 -0300 Subject: [PATCH 031/252] Now text_backbone_config has its own class --- .../configuration_grounding_dino.py | 119 ++++++++++++++++-- 1 file changed, 111 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 09b9c41f131964..a3aa2b733d0474 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -25,6 +25,115 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } +# Copied from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet +class GroundingDINOTextPrenetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a + [`TFGroundingDINOTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. 
Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`] or [`TFGroundingDINOTextPrenetModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`] or + [`TFGroundingDINOTextPrenetModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python + >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOTextPrenetModel + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = GroundingDINOTextPrenetConfig() + + >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration + >>> model = GroundingDINOTextPrenetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "grounding-dino-text-prenet" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + class GroundingDINOConfig(PretrainedConfig): r""" @@ -177,7 +286,7 @@ def __init__( self, use_timm_backbone=False, backbone_config={"model_type": "swin"}, - text_backbone_config="bert-base-uncased", + text_backbone_config=None, num_channels=3, num_queries=900, max_position_embeddings=1024, @@ -187,15 +296,12 @@ def __init__( decoder_layers=6, decoder_ffn_dim=2048, decoder_attention_heads=8, - encoder_layerdrop=0.0, is_encoder_decoder=True, activation_function="relu", d_model=256, dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", @@ -259,9 +365,6 @@ def __init__( self.attention_dropout = attention_dropout self.activation_dropout = activation_dropout self.activation_function = activation_function - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.encoder_layerdrop = encoder_layerdrop self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type self.backbone = backbone @@ -289,7 +392,7 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) + self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config self.max_text_len = max_text_len self.sub_sentence_present = sub_sentence_present # Text Enhancer From d763e0413bc8885f9421aaf8aab1f873079a876e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 13:47:56 -0300 Subject: [PATCH 032/252] Modified convert script --- .../convert_grounding_dino_to_hf.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git 
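Aside (illustrative, not part of the patch): with this commit the text backbone is described by a nested `GroundingDINOTextPrenetConfig` (BERT-base-sized defaults) instead of a checkpoint name, and `GroundingDINOConfig` builds that default itself when `text_backbone_config=None`. A hedged usage sketch, assuming this branch is installed and the classes are importable from the module path below:

```python
# Sketch only; import path and the example hyperparameters are assumptions.
from transformers.models.grounding_dino.configuration_grounding_dino import (
    GroundingDINOConfig,
    GroundingDINOTextPrenetConfig,
)

# Describe a smaller text tower explicitly instead of relying on the BERT-base defaults.
text_config = GroundingDINOTextPrenetConfig(num_hidden_layers=6, num_attention_heads=8, hidden_size=512)
config = GroundingDINOConfig(text_backbone_config=text_config)
print(config.text_backbone_config.num_hidden_layers)  # 6
```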
a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4f2f3716329ed4..29ad93f70ab536 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -374,7 +374,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return attention_mask, position_ids.to(torch.long) - tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") @@ -401,12 +401,21 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token @torch.no_grad() -def convert_grounding_dino_checkpoint( - model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str = None, push_to_hub: bool = False -): +def convert_grounding_dino_checkpoint(args): + + model_name = args.model_name + pytorch_dump_folder_path = args.pytorch_dump_folder_path + push_to_hub = args.push_to_hub + + checkpoint_mapping = { + "grounding-dino-tiny": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", + "grounding-dino-base": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + } # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) + checkpoint_path = checkpoint_mapping[model_name] + # Load original checkpoint original_state_dict = torch.load(checkpoint_path, map_location="cpu") @@ -432,7 +441,7 @@ def convert_grounding_dino_checkpoint( text_inputs, text_token_mask = text_processor(text, config) # Running forward - model( + output = model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"], @@ -451,8 +460,11 @@ def convert_grounding_dino_checkpoint( if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") - model.push_to_hub(f"microsoft/{model_name}") - image_processor.push_to_hub(f"microsoft/{model_name}") + model.push_to_hub(f"EduardoPacheco/{model_name}") + #TODO push image processor to hub + # image_processor.push_to_hub(f"microsoft/{model_name}") + #TODO push tokenizer to hub + #TODO push processor to hub if __name__ == "__main__": @@ -460,17 +472,17 @@ def convert_grounding_dino_checkpoint( # Required parameters parser.add_argument( "--model_name", - default="grounding-dino-tiny", + default="grounding-dino-base", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", ) - parser.add_argument( - "--checkpoint_path", - default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", - type=str, - help="Path to the original PyTorch checkpoint (.pth file).", - ) + # parser.add_argument( + # "--checkpoint_path", + # default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + # type=str, + # help="Path to the original PyTorch checkpoint (.pth file).", + # ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
) @@ -479,6 +491,4 @@ def convert_grounding_dino_checkpoint( ) args = parser.parse_args() - convert_grounding_dino_checkpoint( - args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub - ) + convert_grounding_dino_checkpoint(args) From 04022d4aa398501f692437a1cbb1a4a48dc2bcab Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 15:01:44 -0300 Subject: [PATCH 033/252] Removed unnecessary config attribute --- .../configuration_grounding_dino.py | 2 -- .../convert_grounding_dino_to_hf.py | 21 ++++--------------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index a3aa2b733d0474..fbd0d483b48e45 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -326,7 +326,6 @@ def __init__( disable_custom_kernels=False, # other parameters max_text_len=256, - sub_sentence_present=True, text_enhancer_dropout=0.0, fusion_droppath=0.1, fusion_dropout=0.0, @@ -394,7 +393,6 @@ def __init__( # Text backbone self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config self.max_text_len = max_text_len - self.sub_sentence_present = sub_sentence_present # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout # Fusion diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 29ad93f70ab536..ed16da3f0c4617 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -347,7 +347,6 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token # generate attention mask and positional ids attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) position_ids = torch.zeros((bs, num_token), device=input_ids.device) - cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 for i in range(idxs.shape[0]): row, col = idxs[i] @@ -359,18 +358,8 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token position_ids[row, previous_col + 1 : col + 1] = torch.arange( 0, col - previous_col, device=input_ids.device ) - c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() - c2t_maski[previous_col + 1 : col] = True - cate_to_token_mask_list[row].append(c2t_maski) - previous_col = col - - cate_to_token_mask_list = [ - torch.stack(cate_to_token_mask_listi, dim=0) for cate_to_token_mask_listi in cate_to_token_mask_list - ] - # # padding mask - # padding_mask = tokenized['attention_mask'] - # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + previous_col = col return attention_mask, position_ids.to(torch.long) @@ -383,7 +372,6 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token ) max_text_len = config.max_text_len - sub_sentence_present = config.sub_sentence_present if text_self_attention_masks.shape[1] > max_text_len: text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] position_ids = position_ids[:, :max_text_len] @@ -392,10 +380,9 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token 
tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] # extract text embeddings - if sub_sentence_present: - tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} - tokenized_for_encoder["attention_mask"] = text_self_attention_masks - tokenized_for_encoder["position_ids"] = position_ids + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids return tokenized_for_encoder, tokenized.attention_mask.bool() From 938f805a92a8a4c73aebce5938db0067736cda4f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:06:12 -0300 Subject: [PATCH 034/252] Added new function to generate sub sentence mask --- .../grounding_dino/modeling_grounding_dino.py | 76 +++++++++++++++---- 1 file changed, 61 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 69264d51b5e6b0..d75db4735ad30a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -47,7 +47,7 @@ from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone -from .configuration_grounding_dino import GroundingDINOConfig +from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextPrenetConfig from .load_custom import load_cuda_kernels @@ -1923,9 +1923,16 @@ def custom_forward(*inputs): layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, + query_pos, + reference_points_input, + spatial_shapes, + level_start_index, vision_encoder_hidden_states, vision_encoder_attention_mask, - None, + text_encoder_hidden_states, + text_encoder_attention_mask, + self_attn_mask, + None ) else: layer_outputs = decoder_layer( @@ -2005,6 +2012,42 @@ def custom_forward(*inputs): text_cross_attentions=all_cross_attns_text, ) +SPECIAL_TOKENS = [101, 102, 1012, 1029] +def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: + """Generate attention mask between each pair of special tokens and positional ids. + Args: + input_ids (torch.LongTensor): input ids. Shape: [bs, num_token] + Returns: + Tuple[torch.Tensor]: attention mask between each special tokens and position_ids + """ + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in SPECIAL_TOKENS: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + + previous_col = col + + return attention_mask, position_ids.to(torch.long) + @add_start_docstrings( """ @@ -2173,11 +2216,8 @@ def forward( self, pixel_values: Tensor, input_ids: Tensor, - attention_mask: Tensor, token_type_ids: Tensor, - text_token_mask: Tensor, - text_self_attention_masks: Tensor, - position_ids: Tensor, + attention_mask: Tensor, pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, @@ -2214,8 +2254,19 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + + max_text_len = self.config.max_text_len + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + input_ids = input_ids[:, :max_text_len] + token_type_ids = token_type_ids[:, :max_text_len] + text_token_mask = text_token_mask[:, :max_text_len] + # Extract text features from text backbone - text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)[ + text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ "last_hidden_state" ] text_features = self.input_proj_text(text_features) @@ -2463,11 +2514,8 @@ def forward( self, pixel_values: torch.FloatTensor, input_ids: torch.LongTensor, - attention_mask: torch.BoolTensor, + attention_mask: torch.LongTensor, token_type_ids: torch.LongTensor, - text_token_mask: torch.BoolTensor, - text_self_attention_masks: torch.BoolTensor, - position_ids: torch.LongTensor, pixel_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, @@ -2523,9 +2571,6 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, - text_token_mask=text_token_mask, - text_self_attention_masks=text_self_attention_masks, - position_ids=position_ids, pixel_mask=pixel_mask, encoder_outputs=encoder_outputs, output_attentions=output_attentions, @@ -2551,7 +2596,7 @@ def forward( outputs_class = self.class_embed[level]( vision_hidden_state=hidden_states[:, level], text_hidden_state=enc_text_hidden_state, - text_token_mask=text_token_mask, + text_token_mask=attention_mask.bool(), ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: 
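Aside (illustrative, not part of the patch): `generate_masks_with_special_tokens_and_transfer_map` above turns a flat prompt such as "a cat. a dog." into per-phrase attention blocks with restarted position ids, so each phrase only attends within itself. The self-contained sketch below mirrors that logic; the helper name and the bert-base-uncased token ids used for the example are assumptions for illustration only.

```python
# Standalone sketch of the sub-sentence masking; mirrors the patch logic, not imported from it.
import torch

SPECIAL_TOKENS = [101, 102, 1012, 1029]  # [CLS], [SEP], ".", "?"


def build_sub_sentence_masks(input_ids: torch.LongTensor):
    batch_size, num_token = input_ids.shape
    special = torch.zeros((batch_size, num_token), dtype=torch.bool)
    for token_id in SPECIAL_TOKENS:
        special |= input_ids == token_id

    # Start from the identity so every token can at least attend to itself.
    attention_mask = torch.eye(num_token, dtype=torch.bool).unsqueeze(0).repeat(batch_size, 1, 1)
    position_ids = torch.zeros((batch_size, num_token), dtype=torch.long)
    previous_col = 0
    for row, col in torch.nonzero(special):
        if col == 0 or col == num_token - 1:
            attention_mask[row, col, col] = True
            position_ids[row, col] = 0
        else:
            # Tokens between two special tokens form one block and restart their positions at 0.
            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
            position_ids[row, previous_col + 1 : col + 1] = torch.arange(0, col - previous_col)
        previous_col = col
    return attention_mask, position_ids


ids = torch.tensor([[101, 1037, 4937, 1012, 1037, 3899, 1012, 102]])  # assumed ids for "a cat. a dog."
mask, positions = build_sub_sentence_masks(ids)
print(positions)  # tensor([[0, 0, 1, 2, 0, 1, 2, 0]]) -> positions restart for each phrase
print(mask[0, 1, 2].item(), mask[0, 1, 4].item())  # True False -> "cat" tokens never see "dog" tokens
```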
@@ -3609,6 +3654,7 @@ class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ + config_class = GroundingDINOTextPrenetConfig def __init__(self, config, add_pooling_layer=True): super().__init__(config) From 6f08b04abbf1c81d7f6f6da650b79e8bc0d70e31 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:26:14 -0300 Subject: [PATCH 035/252] Renamed parameters with gamma in the name as it's currently not allowed --- .../models/grounding_dino/modeling_grounding_dino.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d75db4735ad30a..71e7cb33fba0b9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1107,8 +1107,8 @@ def __init__(self, config, init_values=1e-4): # add layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.gamma_v = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) - self.gamma_l = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.vision_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.text_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) @@ -1119,8 +1119,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at vision_attention_mask=attention_mask_vision, text_attention_mask=attention_mask_text, ) - vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) - text_features = text_features + self.drop_path(self.gamma_l * delta_t) + vision_features = vision_features + self.drop_path(self.vision_param * delta_v) + text_features = text_features + self.drop_path(self.text_param * delta_t) return (vision_features, vision_attn), (text_features, text_attn) From 7666253ac7b1ceba183ab2c55a31ef6713ca13d4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:27:04 -0300 Subject: [PATCH 036/252] Removed tokenization and image_processing scripts since we'll map from existing models --- .../image_processing_grounding_dino.py | 967 ------------------ .../tokenization_grounding_dino.py | 0 2 files changed, 967 deletions(-) delete mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py delete mode 100644 src/transformers/models/grounding_dino/tokenization_grounding_dino.py diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py deleted file mode 100644 index 1adf8e8e0dcd62..00000000000000 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ /dev/null @@ -1,967 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Image processor class for Deformable DETR.""" - -import io -import pathlib -from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union - -import numpy as np - -from ...feature_extraction_utils import BatchFeature -from ...image_processing_utils import BaseImageProcessor, get_size_dict -from ...image_transforms import ( - PaddingMode, - center_to_corners_format, - corners_to_center_format, - id_to_rgb, - pad, - rescale, - resize, - rgb_to_id, - to_channel_dimension_format, -) -from ...image_utils import ( - IMAGENET_DEFAULT_MEAN, - IMAGENET_DEFAULT_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_coco_detection_annotations, - valid_images, -) -from ...utils import ( - ExplicitEnum, - TensorType, - is_flax_available, - is_jax_tensor, - is_scipy_available, - is_tf_available, - is_tf_tensor, - is_torch_available, - is_torch_tensor, - is_vision_available, - logging, -) - - -if is_torch_available(): - import torch - from torch import nn - - -if is_vision_available(): - import PIL - -if is_scipy_available(): - import scipy.special - import scipy.stats - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotionFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - - -SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION) - - -# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio -def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: - """ - Computes the output image size given the input image size and the desired output size. - - Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): - The desired output size. - max_size (`int`, *optional*): - The maximum allowed output size. - """ - height, width = image_size - if max_size is not None: - min_original_size = float(min((height, width))) - max_original_size = float(max((height, width))) - if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) - - if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: - ow = size - oh = int(size * height / width) - else: - oh = size - ow = int(size * width / height) - return (oh, ow) - - -# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size -def get_resize_output_image_size( - input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], - max_size: Optional[int] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, -) -> Tuple[int, int]: - """ - Computes the output image size given the input image size and the desired output size. If the desired output size - is a tuple or list, the output image size is returned as is. 
If the desired output size is an integer, the output - image size is computed by keeping the aspect ratio of the input image size. - - Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): - The desired output size. - max_size (`int`, *optional*): - The maximum allowed output size. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred from the input image. - """ - image_size = get_image_size(input_image, input_data_format) - if isinstance(size, (list, tuple)): - return size - - return get_size_with_aspect_ratio(image_size, size, max_size) - - -# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn -def get_numpy_to_framework_fn(arr) -> Callable: - """ - Returns a function that converts a numpy array to the framework of the input array. - - Args: - arr (`np.ndarray`): The array to convert. - """ - if isinstance(arr, np.ndarray): - return np.array - if is_tf_available() and is_tf_tensor(arr): - import tensorflow as tf - - return tf.convert_to_tensor - if is_torch_available() and is_torch_tensor(arr): - import torch - - return torch.tensor - if is_flax_available() and is_jax_tensor(arr): - import jax.numpy as jnp - - return jnp.array - raise ValueError(f"Cannot convert arrays of type {type(arr)}") - - -# Copied from transformers.models.detr.image_processing_detr.safe_squeeze -def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: - """ - Squeezes an array, but only if the axis specified has dim 1. - """ - if axis is None: - return arr.squeeze() - - try: - return arr.squeeze(axis=axis) - except ValueError: - return arr - - -# Copied from transformers.models.detr.image_processing_detr.normalize_annotation -def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: - image_height, image_width = image_size - norm_annotation = {} - for key, value in annotation.items(): - if key == "boxes": - boxes = value - boxes = corners_to_center_format(boxes) - boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) - norm_annotation[key] = boxes - else: - norm_annotation[key] = value - return norm_annotation - - -# Copied from transformers.models.detr.image_processing_detr.max_across_indices -def max_across_indices(values: Iterable[Any]) -> List[Any]: - """ - Return the maximum value across all indices of an iterable of values. - """ - return [max(values_i) for values_i in zip(*values)] - - -# Copied from transformers.models.detr.image_processing_detr.get_max_height_width -def get_max_height_width( - images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None -) -> List[int]: - """ - Get the maximum height and width across all images in a batch. 
- """ - if input_data_format is None: - input_data_format = infer_channel_dimension_format(images[0]) - - if input_data_format == ChannelDimension.FIRST: - _, max_height, max_width = max_across_indices([img.shape for img in images]) - elif input_data_format == ChannelDimension.LAST: - max_height, max_width, _ = max_across_indices([img.shape for img in images]) - else: - raise ValueError(f"Invalid channel dimension format: {input_data_format}") - return (max_height, max_width) - - -# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask -def make_pixel_mask( - image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None -) -> np.ndarray: - """ - Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. - - Args: - image (`np.ndarray`): - Image to make the pixel mask for. - output_size (`Tuple[int, int]`): - Output size of the mask. - """ - input_height, input_width = get_image_size(image, channel_dim=input_data_format) - mask = np.zeros(output_size, dtype=np.int64) - mask[:input_height, :input_width] = 1 - return mask - -def prepare_coco_detection_annotation( - image, - target, - input_data_format: Optional[Union[ChannelDimension, str]] = None, -): - """ - Convert the target in COCO format into the format expected by GroundingDINO. - """ - image_height, image_width = get_image_size(image, channel_dim=input_data_format) - - image_id = target["image_id"] - image_id = np.asarray([image_id], dtype=np.int64) - - # Get all COCO annotations for the given image. - annotations = target["annotations"] - annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] - - classes = [obj["category_id"] for obj in annotations] - classes = np.asarray(classes, dtype=np.int64) - - # for conversion to coco api - area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) - iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) - - boxes = [obj["bbox"] for obj in annotations] - # guard against no boxes via resizing - boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) - boxes[:, 2:] += boxes[:, :2] - boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) - boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) - - keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) - - new_target = {} - new_target["image_id"] = image_id - new_target["class_labels"] = classes[keep] - new_target["boxes"] = boxes[keep] - new_target["area"] = area[keep] - new_target["iscrowd"] = iscrowd[keep] - new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) - - if annotations and "keypoints" in annotations[0]: - keypoints = [obj["keypoints"] for obj in annotations] - keypoints = np.asarray(keypoints, dtype=np.float32) - num_keypoints = keypoints.shape[0] - keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints - new_target["keypoints"] = keypoints[keep] - - return new_target - -# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities -def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - probs = scipy.special.softmax(logits, axis=-1) - labels = probs.argmax(-1, keepdims=True) - scores = np.take_along_axis(probs, labels, axis=-1) - scores, labels = scores.squeeze(-1), labels.squeeze(-1) - return scores, labels - -# Copied from 
transformers.models.detr.image_processing_detr.resize_annotation -def resize_annotation( - annotation: Dict[str, Any], - orig_size: Tuple[int, int], - target_size: Tuple[int, int], - threshold: float = 0.5, - resample: PILImageResampling = PILImageResampling.NEAREST, -): - """ - Resizes an annotation to a target size. - - Args: - annotation (`Dict[str, Any]`): - The annotation dictionary. - orig_size (`Tuple[int, int]`): - The original size of the input image. - target_size (`Tuple[int, int]`): - The target size of the image, as returned by the preprocessing `resize` step. - threshold (`float`, *optional*, defaults to 0.5): - The threshold used to binarize the segmentation masks. - resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): - The resampling filter to use when resizing the masks. - """ - ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) - ratio_height, ratio_width = ratios - - new_annotation = {} - new_annotation["size"] = target_size - - for key, value in annotation.items(): - if key == "boxes": - boxes = value - scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) - new_annotation["boxes"] = scaled_boxes - elif key == "area": - area = value - scaled_area = area * (ratio_width * ratio_height) - new_annotation["area"] = scaled_area - elif key == "masks": - masks = value[:, None] - masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) - masks = masks.astype(np.float32) - masks = masks[:, 0] > threshold - new_annotation["masks"] = masks - elif key == "size": - new_annotation["size"] = target_size - else: - new_annotation[key] = value - - return new_annotation - - -class GroundingDINOImageProcessor(BaseImageProcessor): - r""" - Constructs a Grounding DINO image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be - overridden by the `do_resize` parameter in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Resampling filter to use if resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the - `do_rescale` parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - do_normalize: - Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the - `preprocess` method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): - Mean values to use when normalizing the image. Can be a single value or a list of values, one for each - channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): - Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one - for each channel. 
Can be overridden by the `image_std` parameter in the `preprocess` method. - do_pad (`bool`, *optional*, defaults to `True`): - Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be - overridden by the `do_pad` parameter in the `preprocess` method. - """ - - model_input_names = ["pixel_values", "pixel_mask"] - - def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BILINEAR, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Union[float, List[float]] = None, - image_std: Union[float, List[float]] = None, - do_pad: bool = True, - **kwargs, - ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} - size = get_size_dict(size, max_size=max_size, default_to_square=False) - - super().__init__(**kwargs) - self.format = format - self.do_resize = do_resize - self.size = size - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad - - @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO - def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - - def prepare_annotation( - self, - image: np.ndarray, - target: Dict, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> Dict: - """ - Prepare an annotation for feeding into Grounding DINO model. - """ - target = prepare_coco_detection_annotation( - image, target, input_data_format=input_data_format - ) - - return target - - def prepare(self, image, target): - logger.warning_once( - "The `prepare` method is deprecated and will be removed in a v4.33. " - "Please use `prepare_annotation` instead. 
Note: the `prepare_annotation` method " - "does not return the image anymore.", - ) - target = self.prepare_annotation(image, target) - return image, target - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize - def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BILINEAR, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> np.ndarray: - """ - Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an - int, smaller edge of the image will be matched to this number. - - Args: - image (`np.ndarray`): - Image to resize. - size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Resampling filter to use if resizing the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) - if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( - image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format - ) - elif "height" in size and "width" in size: - size = (size["height"], size["width"]) - else: - raise ValueError( - "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" - f" {size.keys()}." - ) - image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs - ) - return image - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation - def resize_annotation( - self, - annotation, - orig_size, - size, - resample: PILImageResampling = PILImageResampling.NEAREST, - ) -> Dict: - """ - Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched - to this number. - """ - return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale - def rescale( - self, - image: np.ndarray, - rescale_factor: float, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """ - Rescale the image by the given factor. image = image * rescale_factor. - - Args: - image (`np.ndarray`): - Image to rescale. - rescale_factor (`float`): - The value to use for rescaling. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. 
Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the input image. If unset, is inferred from the input image. Can be - one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - """ - return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation - def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: - """ - Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to - `[center_x, center_y, width, height]` format. - """ - return normalize_annotation(annotation, image_size=image_size) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image - def _pad_image( - self, - image: np.ndarray, - output_size: Tuple[int, int], - constant_values: Union[float, Iterable[float]] = 0, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """ - Pad an image with zeros to the given size. - """ - input_height, input_width = get_image_size(image, channel_dim=input_data_format) - output_height, output_width = output_size - - pad_bottom = output_height - input_height - pad_right = output_width - input_width - padding = ((0, pad_bottom), (0, pad_right)) - padded_image = pad( - image, - padding, - mode=PaddingMode.CONSTANT, - constant_values=constant_values, - data_format=data_format, - input_data_format=input_data_format, - ) - return padded_image - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad - def pad( - self, - images: List[np.ndarray], - constant_values: Union[float, Iterable[float]] = 0, - return_pixel_mask: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> BatchFeature: - """ - Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width - in the batch and optionally returns their corresponding pixel mask. - - Args: - image (`np.ndarray`): - Image to pad. - constant_values (`float` or `Iterable[float]`, *optional*): - The value to use for the padding if `mode` is `"constant"`. - return_pixel_mask (`bool`, *optional*, defaults to `True`): - Whether to return a pixel mask. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. 
- input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - pad_size = get_max_height_width(images, input_data_format=input_data_format) - - padded_images = [ - self._pad_image( - image, - pad_size, - constant_values=constant_values, - data_format=data_format, - input_data_format=input_data_format, - ) - for image in images - ] - data = {"pixel_values": padded_images} - - if return_pixel_mask: - masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) - for image in images - ] - data["pixel_mask"] = masks - - return BatchFeature(data=data, tensor_type=return_tensors) - - def preprocess( - self, - images: ImageInput, - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - resample=None, # PILImageResampling - do_rescale: Optional[bool] = None, - rescale_factor: Optional[Union[int, float]] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_pad: Optional[bool] = None, - return_tensors: Optional[Union[TensorType, str]] = None, - data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> BatchFeature: - """ - Preprocess an image or a batch of images so that it can be used by the model. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. - annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - do_resize (`bool`, *optional*, defaults to self.do_resize): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. - resample (`PILImageResampling`, *optional*, defaults to self.resample): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to self.do_rescale): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to self.rescale_factor): - Rescale factor to use when rescaling the image. - do_normalize (`bool`, *optional*, defaults to self.do_normalize): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): - Mean to use when normalizing the image. - image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): - Standard deviation to use when normalizing the image. - do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. 
- return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): - Type of tensors to return. If `None`, will return the list of images. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - if "pad_and_return_pixel_mask" in kwargs: - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - max_size = None - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - size = kwargs.pop("max_size") - - do_resize = self.do_resize if do_resize is None else do_resize - size = self.size if size is None else size - size = get_size_dict(size=size, max_size=max_size, default_to_square=False) - resample = self.resample if resample is None else resample - do_rescale = self.do_rescale if do_rescale is None else do_rescale - rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor - do_normalize = self.do_normalize if do_normalize is None else do_normalize - image_mean = self.image_mean if image_mean is None else image_mean - image_std = self.image_std if image_std is None else image_std - do_pad = self.do_pad if do_pad is None else do_pad - - if do_resize is not None and size is None: - raise ValueError("Size and max_size must be specified if do_resize is True.") - - if do_rescale is not None and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") - - if do_normalize is not None and (image_mean is None or image_std is None): - raise ValueError("Image mean and std must be specified if do_normalize is True.") - - images = make_list_of_images(images) - if annotations is not None and isinstance(annotations, dict): - annotations = [annotations] - - if annotations is not None and len(images) != len(annotations): - raise ValueError( - f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." - ) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - if not valid_coco_detection_annotations(annotations): - raise ValueError( - "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts" - "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " - "being a list of annotations in the COCO format." 
- ) - - # All transformations expect numpy arrays - images = [to_numpy_array(image) for image in images] - - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) - if annotations is not None: - prepared_images = [] - prepared_annotations = [] - for image, target in zip(images, annotations): - target = self.prepare_annotation( - image, - target, - input_data_format=input_data_format, - ) - prepared_images.append(image) - prepared_annotations.append(target) - images = prepared_images - annotations = prepared_annotations - del prepared_images, prepared_annotations - - # transformations - if do_resize: - if annotations is not None: - resized_images, resized_annotations = [], [] - for image, target in zip(images, annotations): - orig_size = get_image_size(image, input_data_format) - resized_image = self.resize( - image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format - ) - resized_annotation = self.resize_annotation( - target, orig_size, get_image_size(resized_image, input_data_format) - ) - resized_images.append(resized_image) - resized_annotations.append(resized_annotation) - images = resized_images - annotations = resized_annotations - del resized_images, resized_annotations - else: - images = [ - self.resize(image, size=size, resample=resample, input_data_format=input_data_format) - for image in images - ] - - if do_rescale: - images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] - - if do_normalize: - images = [ - self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images - ] - if annotations is not None: - annotations = [ - self.normalize_annotation(annotation, get_image_size(image, input_data_format)) - for annotation, image in zip(annotations, images) - ] - - if do_pad: - # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} - data = self.pad( - images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format - ) - else: - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - for image in images - ] - data = {"pixel_values": images} - - encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) - if annotations is not None: - encoded_inputs["labels"] = [ - BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations - ] - - return encoded_inputs - - # POSTPROCESSING METHODS - TODO: add support for other frameworks - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`GroundingDINOForObjectDetection`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. - Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - - def post_process_object_detection( - self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100 - ): - """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`GroundingDINOForObjectDetection`]): - Raw outputs of the model. - threshold (`float`, *optional*): - Score threshold to keep object detection predictions. - target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. - top_k (`int`, *optional*, defaults to 100): - Keep only top k bounding boxes before filtering by thresholding. - - Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if target_sizes is not None: - if len(out_logits) != len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits" - ) - - prob = out_logits.sigmoid() - prob = prob.view(out_logits.shape[0], -1) - k_value = min(top_k, prob.size(1)) - topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - if isinstance(target_sizes, List): - img_h = torch.Tensor([i[0] for i in target_sizes]) - img_w = torch.Tensor([i[1] for i in target_sizes]) - else: - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [] - for s, l, b in zip(scores, labels, boxes): - score = s[s > threshold] - label = l[s > threshold] - box = b[s > threshold] - results.append({"scores": score, "labels": label, "boxes": box}) - - return results diff --git a/src/transformers/models/grounding_dino/tokenization_grounding_dino.py b/src/transformers/models/grounding_dino/tokenization_grounding_dino.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 From 046e0c5ed5dd4e6edd5a29b56976e1ca318c5385 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:28:13 -0300 Subject: [PATCH 037/252] Fixed some issues with configuration --- .../configuration_grounding_dino.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index fbd0d483b48e45..e900714852fbaa 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Grounding DINO model configuration""" +import os +from typing import Union from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -25,7 +27,7 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } -# Copied from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet +# Modified from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet class GroundingDINOTextPrenetConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a @@ -134,6 +136,24 @@ def __init__( self.use_cache = use_cache self.classifier_dropout = classifier_dropout + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPSegConfig + if config_dict.get("model_type") == "grounding-dino": + config_dict = config_dict["text_backbone_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + class GroundingDINOConfig(PretrainedConfig): r""" @@ -289,7 +309,6 @@ def __init__( text_backbone_config=None, num_channels=3, num_queries=900, - max_position_embeddings=1024, encoder_layers=6, encoder_ffn_dim=2048, encoder_attention_heads=8, @@ -352,7 +371,6 @@ def __init__( self.backbone_config = backbone_config self.num_channels = num_channels self.num_queries = num_queries - self.max_position_embeddings = max_position_embeddings self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim self.encoder_layers = encoder_layers @@ -391,7 +409,7 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config + self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else GroundingDINOTextPrenetConfig(**text_backbone_config) self.max_text_len = max_text_len # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout From 70b248dfd515ad27f6d81758ddaa9992096fed98 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:32:22 -0300 Subject: [PATCH 038/252] Just some modifications on conversion script --- .../convert_grounding_dino_to_hf.py | 89 ++++--------------- 1 file changed, 18 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index ed16da3f0c4617..680c3872bf68dc 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -159,8 +159,8 @@ def create_rename_keys(state_dict, config): 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', } fusion_key_mappings = { - 'gamma_v': 'fusion_layer.gamma_v', - 'gamma_l': 'fusion_layer.gamma_l', + 'gamma_v': 
'fusion_layer.vision_param', + 'gamma_l': 'fusion_layer.text_param', 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', @@ -326,66 +326,11 @@ def preprocess_caption(caption: str) -> str: return result return result + "." - def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: - """Generate attention mask between each pair of special tokens - Args: - input_ids (torch.Tensor): input ids. Shape: [bs, num_token] - special_tokens_mask (list): special tokens mask. - Returns: - torch.Tensor: attention mask between each special tokens. - """ - input_ids = tokenized["input_ids"] - bs, num_token = input_ids.shape - # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens - special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() - for special_token in special_tokens_list: - special_tokens_mask |= input_ids == special_token - - # idxs: each row is a list of indices of special tokens - idxs = torch.nonzero(special_tokens_mask) - - # generate attention mask and positional ids - attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - position_ids = torch.zeros((bs, num_token), device=input_ids.device) - previous_col = 0 - for i in range(idxs.shape[0]): - row, col = idxs[i] - if (col == 0) or (col == num_token - 1): - attention_mask[row, col, col] = True - position_ids[row, col] = 0 - else: - attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True - position_ids[row, previous_col + 1 : col + 1] = torch.arange( - 0, col - previous_col, device=input_ids.device - ) - - previous_col = col - - return attention_mask, position_ids.to(torch.long) - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer - special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") - text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens - ) - - max_text_len = config.max_text_len - if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] - position_ids = position_ids[:, :max_text_len] - tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] - - # extract text embeddings - tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} - tokenized_for_encoder["attention_mask"] = text_self_attention_masks - tokenized_for_encoder["position_ids"] = position_ids - - return tokenized_for_encoder, tokenized.attention_mask.bool() + return tokenized @torch.no_grad() def convert_grounding_dino_checkpoint(args): @@ -415,7 +360,8 @@ def convert_grounding_dino_checkpoint(args): read_in_q_k_v(new_state_dict, config) # Load HF implementation with default config and converted state dict - model = GroundingDINOForObjectDetection(config).eval() + model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").eval() + # model = GroundingDINOForObjectDetection(config=config).eval() 
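# Illustrative sketch of what the simplified text pipeline above produces; the
# caption is arbitrary and `bert-base-uncased` is only the stand-in tokenizer
# that `text_processor` itself uses for now.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
caption = "a cat. a remote control"
caption = caption.lower().strip()
caption = caption if caption.endswith(".") else caption + "."  # same normalization as preprocess_caption
text_inputs = tokenizer([caption], padding="longest", return_tensors="pt")
# `text_inputs` holds input_ids, token_type_ids and attention_mask, which are the
# tensors unpacked into the model call below via **text_inputs.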
model.load_state_dict(new_state_dict, strict=False) # Load and process test image @@ -425,19 +371,24 @@ def convert_grounding_dino_checkpoint(args): [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] ) image_inputs = image_processor(image) - text_inputs, text_token_mask = text_processor(text, config) + text_inputs = text_processor(text, config) # Running forward output = model( pixel_values=image_inputs.unsqueeze(0), - input_ids=text_inputs["input_ids"], - attention_mask=text_inputs["attention_mask"], - token_type_ids=text_inputs["token_type_ids"], - text_token_mask=text_token_mask, - text_self_attention_masks=text_inputs["attention_mask"], - position_ids=text_inputs["position_ids"], + **text_inputs ) + # output.pred_boxes[:, :3, :] + # tensor([[[0.7674, 0.4136, 0.4572, 0.7305], + # [0.2566, 0.5463, 0.4760, 0.8777], + # [0.2585, 0.5442, 0.4640, 0.8683]]]) + # + # output.logits[:, :3, :4] + # tensor([[[-4.8913, -0.1900, -0.2161, -4.2374], + # [-4.9652, -0.3719, -0.3950, -4.2315], + # [-5.9599, -3.3765, -3.3104, -5.9752]]]) + if pytorch_dump_folder_path is not None: print(f"Saving model {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) @@ -448,10 +399,6 @@ def convert_grounding_dino_checkpoint(args): if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") model.push_to_hub(f"EduardoPacheco/{model_name}") - #TODO push image processor to hub - # image_processor.push_to_hub(f"microsoft/{model_name}") - #TODO push tokenizer to hub - #TODO push processor to hub if __name__ == "__main__": @@ -459,7 +406,7 @@ def convert_grounding_dino_checkpoint(args): # Required parameters parser.add_argument( "--model_name", - default="grounding-dino-base", + default="grounding-dino-tiny", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", From 3bc92b7688d531b8bd7e2ddf9708b08d6144fee6 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:33:52 -0300 Subject: [PATCH 039/252] Other modifications --- src/transformers/__init__.py | 4 ++-- src/transformers/models/grounding_dino/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4ea2c3ace121ea..1775754773a314 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -357,7 +357,7 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -4413,7 +4413,7 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index e3767e017d1023..df2b0d907f1b65 100644 --- 
a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -18,7 +18,7 @@ _import_structure = { - "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], + "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], } try: @@ -36,7 +36,7 @@ if TYPE_CHECKING: - from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig try: if not is_torch_available(): From 4cae0ca71fa0564e86b1b448359ca2bc5a5e924c Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 22 Aug 2023 18:50:32 -0300 Subject: [PATCH 040/252] Copied deformable detr --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/grounding-dino.md | 48 + src/transformers/__init__.py | 16 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/grounding_dino/__init__.py | 57 + .../configuration_grounding_dino.py | 262 ++ .../convert_grounding_dino_to_pytorch.py | 237 ++ .../models/grounding_dino/load_custom.py | 49 + .../grounding_dino/modeling_grounding_dino.py | 2513 +++++++++++++++++ tests/models/grounding_dino/__init__.py | 0 .../test_modeling_grounding_dino.py | 673 +++++ 15 files changed, 3865 insertions(+) create mode 100644 docs/source/en/model_doc/grounding-dino.md create mode 100644 src/transformers/models/grounding_dino/__init__.py create mode 100644 src/transformers/models/grounding_dino/configuration_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py create mode 100644 src/transformers/models/grounding_dino/load_custom.py create mode 100644 src/transformers/models/grounding_dino/modeling_grounding_dino.py create mode 100644 tests/models/grounding_dino/__init__.py create mode 100644 tests/models/grounding_dino/test_modeling_grounding_dino.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 7fc6ebf7d851b1..b80f2f093699a5 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -523,6 +523,8 @@ title: FocalNet - local: model_doc/glpn title: GLPN + - local: model_doc/grounding-dino + title: Grounding DINO - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md new file mode 100644 index 00000000000000..161a90609174b3 --- /dev/null +++ b/docs/source/en/model_doc/grounding-dino.md @@ -0,0 +1,48 @@ + + +# Grounding DINO + +## Overview + +The Grounding DINO model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
+ + +## GroundingDINOConfig + +[[autodoc]] GroundingDINOConfig + +## GroundingDINOModel + +[[autodoc]] GroundingDINOModel + - forward + +## GroundingDINOForObjectDetection + +[[autodoc]] GroundingDINOForObjectDetection + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a68a492676eac5..ff461296c5e76e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -275,6 +275,7 @@ "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"], "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], "models.deprecated": [], "models.deprecated.bort": [], @@ -1591,6 +1592,14 @@ "DeformableDetrPreTrainedModel", ] ) + _import_structure["models.grounding_dino"].extend( + [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + ) _import_structure["models.deit"].extend( [ "DEIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4428,6 +4437,7 @@ DecisionTransformerConfig, ) from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.deprecated.mctct import ( MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5583,6 +5593,12 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) from .models.deit import ( DEIT_PRETRAINED_MODEL_ARCHIVE_LIST, DeiTForImageClassification, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index b4486039b989da..cf718e4453f79d 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -61,6 +61,7 @@ deberta_v2, decision_transformer, deformable_detr, + grounding_dino, deit, deprecated, deta, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 5690359643c8e8..ca005bbc79df90 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -73,6 +73,7 @@ ("deberta-v2", "DebertaV2Config"), ("decision_transformer", "DecisionTransformerConfig"), ("deformable_detr", "DeformableDetrConfig"), + ("grounding-dino", "GroundingDINOConfig"), ("deit", "DeiTConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), @@ -287,6 +288,7 @@ ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deta", "DETA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -492,6 +494,7 @@ ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), ("deformable_detr", "Deformable DETR"), + ("grounding-dino", "Grounding DINO"), ("deit", 
"DeiT"), ("deplot", "DePlot"), ("deta", "DETA"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index befca6a64b81b7..5bc4db87f7048b 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -50,6 +50,7 @@ ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("data2vec-vision", "BeitFeatureExtractor"), ("deformable_detr", "DeformableDetrFeatureExtractor"), + ("grounding-dino", "GroundingDINOFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("dinat", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 13bb3a6e5d8a8f..a791255829287d 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -53,6 +53,7 @@ ("cvt", "ConvNextImageProcessor"), ("data2vec-vision", "BeitImageProcessor"), ("deformable_detr", "DeformableDetrImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("deit", "DeiTImageProcessor"), ("deta", "DetaImageProcessor"), ("detr", "DetrImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index bbbaa58d6ec0e6..842af5c5272abc 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -71,6 +71,7 @@ ("deberta-v2", "DebertaV2Model"), ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), + ("grounding-dino", "GroundingDINOModel"), ("deit", "DeiTModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), @@ -629,6 +630,7 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), + ("grounding-dino", "GroundingDINOForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py new file mode 100644 index 00000000000000..e3767e017d1023 --- /dev/null +++ b/src/transformers/models/grounding_dino/__init__.py @@ -0,0 +1,57 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_grounding_dino"] = [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py new file mode 100644 index 00000000000000..0b3ae3d74d3475 --- /dev/null +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -0,0 +1,262 @@ +# coding=utf-8 +# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Grounding DINO model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + +GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "idea-research/grg-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", +} + + + +class GroundingDINOConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate + a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Grounding DINO + [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. 
Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`GroundingDINOModel`] can detect in a single image. In case `two_stage` is set to `True`, we use + `two_stage_num_proposals` instead. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional + backbone from the timm package. For a list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. 
+ bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + num_feature_levels (`int`, *optional*, defaults to 4): + The number of input feature levels. + encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `False`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. + two_stage_num_proposals (`int`, *optional*, defaults to 300): + The number of region proposals to be generated, in case `two_stage` is set to `True`. + with_box_refine (`bool`, *optional*, defaults to `False`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. 
+ + Examples: + + ```python + >>> from transformers import GroundingDINOConfig, GroundingDINOModel + + >>> # Initializing a Grounding DINO SenseTime/deformable-detr style configuration + >>> configuration = GroundingDINOConfig() + + >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> model = GroundingDINOModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "grounding-dino" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + use_timm_backbone=True, + backbone_config=None, + num_channels=3, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=1024, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + return_intermediate=True, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + use_pretrained_backbone=True, + dilation=False, + num_feature_levels=4, + encoder_n_points=4, + decoder_n_points=4, + two_stage=False, + two_stage_num_proposals=300, + with_box_refine=False, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + focal_alpha=0.25, + disable_custom_kernels=False, + **kwargs, + ): + if backbone_config is not None and use_timm_backbone: + raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") + + if not use_timm_backbone: + if backbone_config is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_channels = num_channels + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.dilation = dilation + # deformable attributes + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + self.two_stage_num_proposals = two_stage_num_proposals + self.with_box_refine = with_box_refine + if two_stage is True and with_box_refine is False: + raise ValueError("If two_stage is True, with_box_refine must be True.") + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py new file mode 100644 index 00000000000000..d3cef0366b2bca --- /dev/null +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py @@ -0,0 +1,237 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
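# Illustrative sketch of the constraint enforced in the configuration above:
# enabling `two_stage` requires `with_box_refine`. A sketch against the classes
# added in this patch series, not a released API.
from transformers import GroundingDINOConfig

config = GroundingDINOConfig(two_stage=True, with_box_refine=True, two_stage_num_proposals=300)

try:
    GroundingDINOConfig(two_stage=True, with_box_refine=False)
except ValueError as err:
    print(err)  # "If two_stage is True, with_box_refine must be True."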
+"""Convert Grounding DINO checkpoints.""" + + +import argparse +import json +from pathlib import Path + +import requests +import torch +from huggingface_hub import cached_download, hf_hub_url +from PIL import Image + +from transformers import GroundingDINOConfig, GroundingDINOForObjectDetection, DeformableDetrImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def rename_key(orig_key): + if "backbone.0.body" in orig_key: + orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") + if "transformer" in orig_key: + orig_key = orig_key.replace("transformer.", "") + if "norm1" in orig_key: + if "encoder" in orig_key: + orig_key = orig_key.replace("norm1", "self_attn_layer_norm") + else: + orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") + if "norm2" in orig_key: + if "encoder" in orig_key: + orig_key = orig_key.replace("norm2", "final_layer_norm") + else: + orig_key = orig_key.replace("norm2", "self_attn_layer_norm") + if "norm3" in orig_key: + orig_key = orig_key.replace("norm3", "final_layer_norm") + if "linear1" in orig_key: + orig_key = orig_key.replace("linear1", "fc1") + if "linear2" in orig_key: + orig_key = orig_key.replace("linear2", "fc2") + if "query_embed" in orig_key: + orig_key = orig_key.replace("query_embed", "query_position_embeddings") + if "cross_attn" in orig_key: + orig_key = orig_key.replace("cross_attn", "encoder_attn") + + return orig_key + + +def read_in_q_k_v(state_dict): + # transformer decoder self-attention layers + for i in range(6): + # read in weights + bias of input projection layer of self-attention + in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_grounding_dino_checkpoint( + checkpoint_path, + single_scale, + dilation, + with_box_refine, + two_stage, + pytorch_dump_folder_path, + push_to_hub, +): + """ + Copy/paste/tweak model's weights to our Grounding DINO structure. 
+ """ + + # load default config + config = GroundingDINOConfig() + # set config attributes + if single_scale: + config.num_feature_levels = 1 + config.dilation = dilation + config.with_box_refine = with_box_refine + config.two_stage = two_stage + # set labels + config.num_labels = 91 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load image processor + image_processor = DeformableDetrImageProcessor(format="coco_detection") + + # prepare image + img = prepare_img() + encoding = image_processor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info("Converting model...") + + # load original state dict + state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + # rename keys + for key in state_dict.copy().keys(): + val = state_dict.pop(key) + state_dict[rename_key(key)] = val + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "model." + for key in state_dict.copy().keys(): + if not key.startswith("class_embed") and not key.startswith("bbox_embed"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = GroundingDINOForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + + device = "cuda" if torch.cuda.is_available() else "cpu" + model.to(device) + # verify our conversion + outputs = model(pixel_values.to(device)) + + expected_logits = torch.tensor( + [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] + ) + expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) + + if single_scale: + expected_logits = torch.tensor( + [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + ) + expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) + + if single_scale and dilation: + expected_logits = torch.tensor( + [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] + ) + expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) + + if with_box_refine: + expected_logits = torch.tensor( + [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] + ) + expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) + + if with_box_refine and two_stage: + expected_logits = torch.tensor( + [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] + ) + expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]]) + + print("Logits:", outputs.logits[0, :3, :3]) + + assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) + + print("Everything ok!") + + # Save model and image processor + logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") + 
Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + image_processor.save_pretrained(pytorch_dump_folder_path) + + # Push to hub + if push_to_hub: + model_name = "deformable-detr" + model_name += "-single-scale" if single_scale else "" + model_name += "-dc5" if dilation else "" + model_name += "-with-box-refine" if with_box_refine else "" + model_name += "-two-stage" if two_stage else "" + print("Pushing model to hub...") + model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", + type=str, + default="/home/niels/checkpoints/grounding_dino/r50_grounding_dino-checkpoint.pth", + help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", + ) + parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") + parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") + parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") + parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the folder to output PyTorch model.", + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + args = parser.parse_args() + convert_grounding_dino_checkpoint( + args.checkpoint_path, + args.single_scale, + args.dilation, + args.with_box_refine, + args.two_stage, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/src/transformers/models/grounding_dino/load_custom.py b/src/transformers/models/grounding_dino/load_custom.py new file mode 100644 index 00000000000000..97b8f09fb5f446 --- /dev/null +++ b/src/transformers/models/grounding_dino/load_custom.py @@ -0,0 +1,49 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
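# Illustrative sketch of driving the conversion script above from Python instead of
# the command line; the checkpoint and output paths are placeholders, and the
# keyword arguments mirror the parameters of convert_grounding_dino_checkpoint as
# defined above.
convert_grounding_dino_checkpoint(
    checkpoint_path="/path/to/r50_grounding_dino-checkpoint.pth",
    single_scale=False,
    dilation=False,
    with_box_refine=True,
    two_stage=True,
    pytorch_dump_folder_path="/path/to/output",
    push_to_hub=False,
)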
+""" Loading of Grounding DINO's CUDA kernels""" +import os +from pathlib import Path + + +def load_cuda_kernels(): + from torch.utils.cpp_extension import load + + root = Path(__file__).resolve().parent.parent.parent / "kernels" / "grounding_dino" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] + + load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) + + import MultiScaleDeformableAttention as MSDA + + return MSDA diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py new file mode 100644 index 00000000000000..ee80a562e4b851 --- /dev/null +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -0,0 +1,2513 @@ +# coding=utf-8 +# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Grounding DINO model.""" + + +import copy +import math +import warnings +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_torch_cuda_available, + is_vision_available, + replace_return_docstrings, + requires_backends, +) +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import is_ninja_available, logging +from ..auto import AutoBackbone +from .configuration_grounding_dino import GroundingDINOConfig +from .load_custom import load_cuda_kernels + + +logger = logging.get_logger(__name__) + +# Move this to not compile only when importing, this needs to happen later, like in __init__. 
+if is_torch_cuda_available() and is_ninja_available(): + logger.info("Loading custom CUDA kernels...") + try: + MultiScaleDeformableAttention = load_cuda_kernels() + except Exception as e: + logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}") + MultiScaleDeformableAttention = None +else: + MultiScaleDeformableAttention = None + +if is_vision_available(): + from transformers.image_transforms import center_to_corners_format + + +class MultiScaleDeformableAttentionFunction(Function): + @staticmethod + def forward( + context, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + context.im2col_step = im2col_step + output = MultiScaleDeformableAttention.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + context.im2col_step, + ) + context.save_for_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights + ) + return output + + @staticmethod + @once_differentiable + def backward(context, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = context.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + context.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "GroundingDINOConfig" +_CHECKPOINT_FOR_DOC = "idea-research/grg-dino-tiny" + +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "idea-research/grg-dino-tiny", + # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino +] + + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->GroundingDINO +class GroundingDINODecoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDINODecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOModelOutput(ModelOutput): + """ + Base class for outputs of the Grounding DINO encoder-decoder model. + + Args: + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. 
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+            layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+            foreground and background).
+        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Logits of predicted bounding boxes coordinates in the first stage.
+    """
+
+    init_reference_points: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+    intermediate_hidden_states: torch.FloatTensor = None
+    intermediate_reference_points: torch.FloatTensor = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->GroundingDINO
+class GroundingDINOObjectDetectionOutput(ModelOutput):
+    """
+    Output type of [`GroundingDINOForObjectDetection`].
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+            scale-invariant IoU loss.
+        loss_dict (`Dict`, *optional*):
+            A dictionary containing the individual losses. Useful for logging.
+        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+            Classification logits (including no-object) for all queries.
+        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+            possible padding). You can use [`~GroundingDINOProcessor.post_process_object_detection`] to retrieve the
+            unnormalized bounding boxes.
+        auxiliary_outputs (`list[Dict]`, *optional*):
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+            `pred_boxes`) for each decoder layer.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+            plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+            average in the self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+            layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
+            4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
+            in the self-attention heads.
+        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+            Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Initial reference points sent through the Transformer decoder.
+        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+            foreground and background).
+        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Logits of predicted bounding boxes coordinates in the first stage.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[Dict] = None
+    logits: torch.FloatTensor = None
+    pred_boxes: torch.FloatTensor = None
+    auxiliary_outputs: Optional[List[Dict]] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+def _get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+def inverse_sigmoid(x, eps=1e-5):
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)
+    x2 = (1 - x).clamp(min=eps)
+    return torch.log(x1 / x2)
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDINO
+class GroundingDINOFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+ """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDINO +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDINOFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) + + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvEncoder with DeformableDetr->GroundingDINO +class GroundingDINOConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by GroundingDINOFrozenBatchNorm2d as defined above. 
+ + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = {} + if config.dilation: + kwargs["output_stride"] = 16 + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), + in_chans=config.num_channels, + **kwargs, + ) + else: + backbone = AutoBackbone.from_config(config.backbone_config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDINO +class GroundingDINOConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +# Copied from transformers.models.detr.modeling_detr._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): + """ + Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. + """ + batch_size, source_len = mask.size() + target_len = target_len if target_len is not None else source_len + + expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->GroundingDINO +class GroundingDINOSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
+ """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding +class GroundingDINOLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->GroundingDINO +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = GroundingDINOSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +def multi_scale_deformable_attention( + value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor +) -> Tensor: + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, 
num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Grounding DINO. + """ + + def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): + super().__init__() + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in GroundingDINOMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." 
+ ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + self._reset_parameters() + + def _reset_parameters(self): + nn.init.constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.n_heads, 1, 1, 2) + .repeat(1, self.n_levels, self.n_points, 1) + ) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(self.attention_weights.weight.data, 0.0) + nn.init.constant_(self.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(self.value_proj.weight.data) + nn.init.constant_(self.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(self.output_proj.weight.data) + nn.init.constant_(self.output_proj.bias.data, 0.0) + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif reference_points.shape[-1] == 4: + 
sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + if self.disable_custom_kernels: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + else: + try: + # custom kernel + output = MultiScaleDeformableAttentionFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + except Exception: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + output = self.output_proj(output) + + return output, attention_weights + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the Grounding DINO paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, 
key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO +class GroundingDINOEncoderLayer(nn.Module): + def __init__(self, config: GroundingDINOConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = GroundingDINOMultiscaleDeformableAttention( + config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. 
+ reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO +class GroundingDINODecoderLayer(nn.Module): + def __init__(self, config: GroundingDINOConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = GroundingDINOMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. 
+ position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead +class GroundingDINOClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->GroundingDINO +class GroundingDINOPreTrainedModel(PreTrainedModel): + config_class = 
GroundingDINOConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, GroundingDINOLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): + module._reset_parameters() + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GroundingDINODecoder): + module.gradient_checkpointing = value + + +GROUNDING_DINO_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GroundingDINOConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GROUNDING_DINO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`] + for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->GroundingDINO +class GroundingDINOEncoder(GroundingDINOPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`GroundingDINOEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: GroundingDINOConfig + """ + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDINOEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. Used in decoder. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINODecoder(GroundingDINOPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Grounding DINO: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: GroundingDINOConfig + """ + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Grounding DINO + self.bbox_embed = None + self.class_embed = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. 
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+ in `[0, 1]`:
+ - 1 for pixels that are real (i.e. **not masked**),
+ - 0 for pixels that are padding (i.e. **masked**).
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Position embeddings that are added to the queries and keys in each self-attention layer.
+ reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `as_two_stage` else `(batch_size, num_queries, 2)`, *optional*):
+ Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
+ spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of the feature maps.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+ Indexes for the start of each feature level. In range `[0, sequence_length]`.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+ Ratio of valid area in each feature level.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+ intermediate = ()
+ intermediate_reference_points = ()
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if reference_points.shape[-1] == 4:
+ reference_points_input = (
+ reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+ )
+ else:
+ if reference_points.shape[-1] != 2:
+ raise ValueError("Reference points' last dimension must be of size 2")
+ reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
+
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ None,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ position_embeddings=position_embeddings,
+ encoder_hidden_states=encoder_hidden_states,
+ reference_points=reference_points_input,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+
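+ # The block below implements the iterative bounding box refinement mentioned in the class docstring: when
+ # `self.bbox_embed` is set (box-refinement / two-stage variants), each decoder layer predicts a delta in
+ # inverse-sigmoid (logit) space, adds it to the current reference points, re-applies the sigmoid, and detaches
+ # the result so that the refined points feed the next layer without gradients flowing through the refinement path.
+ # For instance, a coordinate of 0.25 has inverse-sigmoid value ln(0.25 / 0.75) ≈ -1.10; adding a predicted delta
+ # of +0.5 and re-applying the sigmoid moves the coordinate to ≈ 0.35.
+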
hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + if reference_points.shape[-1] != 2: + raise ValueError( + f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}" + ) + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return GroundingDINODecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. 
+ """, + GROUNDING_DINO_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO +class GroundingDINOModel(GroundingDINOPreTrainedModel): + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = GroundingDINOConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + if not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + + self.encoder = GroundingDINOEncoder(config) + self.decoder = GroundingDINODecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_heigth = valid_height.float() / height + valid_ratio_width = valid_width.float() / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 
0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, GroundingDINOModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = GroundingDINOModel.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = 
image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(features): + sources.append(self.input_proj[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj[level](features[-1][0]) + else: + source = self.input_proj[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if not self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + valid_ratios = valid_ratios.float() + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=source_flatten, + 
attention_mask=mask_flatten, + position_embeddings=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, prepare decoder inputs + batch_size, _, num_channels = encoder_outputs[0].shape + enc_outputs_class = None + enc_outputs_coord_logits = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes + ) + + # hack implementation for two-stage Grounding DINO + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.config.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + return GroundingDINOModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + 
cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) + + +@add_start_docstrings( + """ + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. + """, + GROUNDING_DINO_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO +class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + # Grounding DINO encoder-decoder model + self.model = GroundingDINOModel(config) + + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = GroundingDINOMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + for box_embed in self.bbox_embed: + nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + # Initialize weights and apply final processing + self.post_init() + + # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
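+ # Only the intermediate decoder layers contribute auxiliary outputs: the final layer's predictions are returned
+ # separately as the model's main `logits` and `pred_boxes`, hence the `[:-1]` slicing in the return below.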
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+
+ @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values,
+ pixel_mask=None,
+ decoder_attention_mask=None,
+ encoder_outputs=None,
+ inputs_embeds=None,
+ decoder_inputs_embeds=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, GroundingDINOForObjectDetection
+ >>> from PIL import Image
+ >>> import requests
+ >>> import torch
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
+ >>> model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> # convert outputs (bounding boxes and class logits) to COCO API
+ >>> target_sizes = torch.tensor([image.size[::-1]])
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
+ ... 0
+ ... ]
+ >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+ ... box = [round(i, 2) for i in box.tolist()]
+ ... print(
+ ... f"Detected {model.config.id2label[label.item()]} with confidence "
+ ... f"{round(score.item(), 3)} at location {box}"
+ ...
) + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = GroundingDINOHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = GroundingDINOLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + if self.config.two_stage: + enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid() + outputs_loss["enc_outputs"] = {"logits": outputs.enc_outputs_class, "pred_boxes": enc_outputs_coord} + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] 
for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = GroundingDINOObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) + + return dict_outputs + + +# Copied from transformers.models.detr.modeling_detr.dice_loss +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDINO +class GroundingDINOLoss(nn.Module): + """ + This class computes the losses for `GroundingDINOForObjectDetection`. 
The process happens in two steps: 1) we + compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of + matched ground-truth / prediction (supervise class and box). + + Args: + matcher (`GroundingDINOHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parameter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros( + [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + dtype=source_logits.dtype, + layout=source_logits.layout, + device=source_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * source_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. 
+ """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx + + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
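+ # Each set of auxiliary outputs is re-matched against the targets and its losses are stored with an `_{i}`
+ # suffix; in the two-stage variant, the encoder proposals are additionally matched against binary
+ # (foreground-only) copies of the targets and their losses are stored with an `_enc` suffix.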
+ if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + if "enc_outputs" in outputs: + enc_outputs = outputs["enc_outputs"] + bin_targets = copy.deepcopy(targets) + for bt in bin_targets: + bt["class_labels"] = torch.zeros_like(bt["class_labels"]) + indices = self.matcher(enc_outputs, bin_targets) + for loss in self.losses: + l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) + l_dict = {k + "_enc": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead +class GroundingDINOMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDINO +class GroundingDINOHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
+ + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# Copied from transformers.models.detr.modeling_detr._upcast +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +# Copied from transformers.models.detr.modeling_detr.box_area +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# Copied from transformers.models.detr.modeling_detr.box_iou +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. 
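+ Formally, GIoU(a, b) = IoU(a, b) - area(C \ (a ∪ b)) / area(C), where C is the smallest axis-aligned box
+ enclosing both a and b; the implementation below computes this as `iou - (area - union) / area`.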
+ + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) + + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] + + return iou - (area - union) / area + + +# Copied from transformers.models.detr.modeling_detr._max_by_axis +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +# Copied from transformers.models.detr.modeling_detr.NestedTensor +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + batch_size, num_channels, height, width = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) diff --git a/tests/models/grounding_dino/__init__.py b/tests/models/grounding_dino/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py new file mode 100644 index 00000000000000..3007eef6399916 --- /dev/null +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -0,0 +1,673 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Testing suite for the PyTorch Grounding DINO model. """ + + +import inspect +import math +import unittest +from typing import Dict, List, Tuple + +from transformers import GroundingDINOConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers.file_utils import cached_property +from transformers.testing_utils import ( + require_timm, + require_torch, + require_torch_gpu, + require_vision, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import GroundingDINOForObjectDetection, GroundingDINOModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class GroundingDINOModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + image_size=196, + n_targets=8, + num_labels=91, + num_feature_levels=4, + encoder_n_points=2, + decoder_n_points=6, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.image_size = image_size + self.n_targets = n_targets + self.num_labels = num_labels + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = ( + math.ceil(self.image_size / 8) ** 2 + + math.ceil(self.image_size / 16) ** 2 + + math.ceil(self.image_size / 32) ** 2 + + math.ceil(self.image_size / 64) ** 2 + ) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + resnet_config = ResNetConfig( + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + hidden_act="relu", + num_labels=3, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + return 
GroundingDINOConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + num_feature_levels=self.num_feature_levels, + encoder_n_points=self.encoder_n_points, + decoder_n_points=self.decoder_n_points, + use_timm_backbone=False, + backbone_config=resnet_config, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, labels): + model = GroundingDINOModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) + + def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = GroundingDINOForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GroundingDINOModel, GroundingDINOForObjectDetection) if is_torch_available() else () + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "GroundingDINOForObjectDetection": + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.image_size, + self.model_tester.image_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = GroundingDINOModelTester(self) + self.config_tester = ConfigTester(self, config_class=GroundingDINOConfig, has_text_modality=False) + + def 
test_config(self): + # we don't test common_properties and arguments_init as these don't apply for Grounding DINO + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + + def test_grounding_dino_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_grounding_dino_model(*config_and_inputs) + + def test_grounding_dino_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_grounding_dino_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="Grounding DINO does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Grounding DINO does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="Grounding DINO is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="Grounding DINO does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + out_len = len(outputs) + + correct_outlen = 8 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "GroundingDINOForObjectDetection": + correct_outlen += 2 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + 
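+ # Deformable attention does not attend over the full key sequence: each query samples a fixed number of points
+ # per feature level, so the attention weights end in (num_heads, num_feature_levels, n_points) rather than
+ # (num_heads, query_length, key_length). That is what the assertions below (and the encoder check above) verify.
+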
self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.decoder_n_points, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." 
+ ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + # we take the second output since last_hidden_state is the second item + output = outputs[1] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if 
model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "GroundingDINOForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if param.requires_grad: + if ( + "level_embed" in name + or "sampling_offsets.bias" in name + or "value_proj" in name + or "output_proj" in name + or "reference_points" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_two_stage_training(self): + model_class = GroundingDINOForObjectDetection + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + config.two_stage = True + config.auxiliary_loss = True + config.with_box_refine = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class GroundingDINOModelIntegrationTests(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") if is_vision_available() else None + + def test_inference_object_detection_head(self): + model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-9.6645, -4.3449, -5.8705], [-9.7035, 
-3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] + ).to(torch_device) + expected_boxes = torch.tensor( + [[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) + + # verify postprocessing + results = image_processor.post_process_object_detection( + outputs, threshold=0.3, target_sizes=[image.size[::-1]] + )[0] + expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]).to(torch_device) + expected_labels = [17, 17, 75, 75, 63] + expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841]).to(torch_device) + + self.assertEqual(len(results["scores"]), 5) + self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4)) + self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) + + def test_inference_object_detection_head_with_box_refine_two_stage(self): + model = GroundingDINOForObjectDetection.from_pretrained( + "SenseTime/deformable-detr-with-box-refine-two-stage" + ).to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] + ).to(torch_device) + expected_boxes = torch.tensor( + [[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) + + @require_torch_gpu + def test_inference_object_detection_head_equivalence_cpu_gpu(self): + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt") + pixel_values = encoding["pixel_values"] + pixel_mask = encoding["pixel_mask"] + + # 1. run model on CPU + model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr-single-scale") + + with torch.no_grad(): + cpu_outputs = model(pixel_values, pixel_mask) + + # 2. run model on GPU + model.to("cuda") + + with torch.no_grad(): + gpu_outputs = model(pixel_values.to("cuda"), pixel_mask.to("cuda")) + + # 3. 
assert equivalence + for key in cpu_outputs.keys(): + assert torch.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-4) + + expected_logits = torch.tensor( + [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + ) + assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-4) From 149b462e673c6735c86198ed21c2470893a7d221 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 23 Aug 2023 12:25:43 -0300 Subject: [PATCH 041/252] First commit --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/tasks/object_detection.md | 2 +- .../configuration_grounding_dino.py | 6 +- .../convert_grounding_dino_to_hf.py | 242 ++++++++++++++++++ .../convert_grounding_dino_to_pytorch.py | 237 ----------------- .../grounding_dino/modeling_grounding_dino.py | 4 +- src/transformers/utils/dummy_pt_objects.py | 24 ++ 13 files changed, 279 insertions(+), 243 deletions(-) create mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py delete mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py diff --git a/README.md b/README.md index 853353ecc379cc..3311a4785b54d7 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_es.md b/README_es.md index e74485a2fcccdd..e5497cdd9cd8f6 100644 --- a/README_es.md +++ b/README_es.md @@ -350,6 +350,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 
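For readers following this patch series, the end-to-end usage the new classes are meant to support mirrors the integration tests added earlier in the series: preprocess an image, run `GroundingDINOForObjectDetection`, then post-process the raw logits and boxes. The sketch below is illustrative only: it assumes a converted checkpoint and a matching image processor are available under the `idea-research/grounding-dino-tiny` id referenced elsewhere in this patch, whereas the tests at this stage still point at Deformable DETR weights as placeholders.

```python
# Illustrative sketch only — not part of the patch; mirrors the integration-test pattern in this series.
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, GroundingDINOForObjectDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Assumed checkpoint id, taken from GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST in this patch;
# at this point in the PR no converted weights have been uploaded yet.
checkpoint = "idea-research/grounding-dino-tiny"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = GroundingDINOForObjectDetection.from_pretrained(checkpoint)

inputs = image_processor(images=image, return_tensors="pt")  # pixel_values + pixel_mask
with torch.no_grad():
    outputs = model(**inputs)

# Turn per-query logits/boxes into thresholded detections scaled back to the original image size
results = image_processor.post_process_object_detection(
    outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
print(results["scores"], results["labels"], results["boxes"])
```

The checkpoint itself would be produced by the `convert_grounding_dino_to_hf.py` script introduced later in this commit, which renames the original state-dict keys and splits the fused qkv projections before loading them into the HF implementation.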
diff --git a/README_hd.md b/README_hd.md index 96c70ce393d66c..7e85a8c53d1713 100644 --- a/README_hd.md +++ b/README_hd.md @@ -322,6 +322,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv .org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https ://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। diff --git a/README_ja.md b/README_ja.md index 55fc6b3cedd230..8f347bdd79264e 100644 --- a/README_ja.md +++ b/README_ja.md @@ -384,6 +384,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました. 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) diff --git a/README_ko.md b/README_ko.md index 60a46aefe51b05..31418f42b8a9ff 100644 --- a/README_ko.md +++ b/README_ko.md @@ -299,6 +299,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다. 1. 
**[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 7b55646bb27dd2..107ed00f3de87f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -323,6 +323,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. 
**[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 15f56c66889e0c..a633740b292821 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -335,6 +335,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 7511ee66dd0b99..8ed9da455bf7ba 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [Grounding DINO](../model_doc/grounding-dino), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0b3ae3d74d3475..23cd86fd3f9d44 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -22,7 +22,7 @@ logger = logging.get_logger(__name__) GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "idea-research/grg-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", + "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } @@ -151,8 +151,8 @@ class GroundingDINOConfig(PretrainedConfig): def __init__( self, - use_timm_backbone=True, - backbone_config=None, + use_timm_backbone=False, + backbone_config={"model_type": "swin"}, num_channels=3, num_queries=300, max_position_embeddings=1024, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py new file mode 100644 index 00000000000000..b5de1d8a652c0e --- /dev/null +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -0,0 +1,242 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert GroundingDINO SimMIM checkpoints from the original repository. 
+ +URL: https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" + +import argparse + +import requests +import torch +from PIL import Image +from torchvision import transforms as T +import torchvision.transforms.functional as F + +from transformers import ( + GroundingDINOConfig, GroundingDINOForObjectDetection +) + +IMAGENET_MEAN = [0.485, 0.456, 0.406] +IMAGENET_STD = [0.229, 0.224, 0.225] + + +def get_grounding_dino_config(model_name): + config = GroundingDINOConfig() + + if "tiny" in model_name: + window_size = 7 + embed_dim = 96 + depths = (2, 2, 6, 2) + num_heads = (3, 6, 12, 24) + image_size = 224 + elif "base" in model_name: + window_size = 12 + embed_dim = 128 + depths = (2, 2, 18, 2) + num_heads = (4, 8, 16, 32) + image_size = 384 + else: + raise ValueError("Model not supported, only supports base and large variants") + + config.backbone_config.window_size = window_size + config.backbone_config.image_size = image_size + config.backbone_config.embed_dim = embed_dim + config.backbone_config.depths = depths + config.backbone_config.num_heads = num_heads + config.backbone_config.out_indices = [2, 3, 4] + + return config + + +def create_rename_keys(config): + rename_keys = [] + # fmt: off + #TODO names might change after modifing GroundingDINOModel class + ########################################## VISION BACKBONE - START + # patch embedding layer + rename_keys.append(("module.backbone.0.patch_embed.proj.weight", + "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("module.backbone.0.patch_embed.proj.bias", + "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) + rename_keys.append(("module.backbone.0.patch_embed.norm.weight", + "model.backbone.conv_encoder.model.embeddings.norm.weight")) + rename_keys.append(("module.backbone.0.patch_embed.norm.bias", + "model.backbone.conv_encoder.model.embeddings.norm.bias")) + + for layer, depth in enumerate(config.backbone_config.depths): + for block in range(depth): + # layernorms + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) + + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) + # attention + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) + # rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) + # intermidiate + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) + + # output + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) + + # downsample + if layer!=len(config.backbone_config.depths)-1: + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.reduction.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) + + for out_indice in config.backbone_config.out_indices: + # Grounding DINO implementation of out_indices isn't aligned with transformers + rename_keys.append((f"module.backbone.0.norm{out_indice-1}.weight", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) + rename_keys.append((f"module.backbone.0.norm{out_indice-1}.bias", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) + + ########################################## VISION BACKBONE - END + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + ########################################## VISION BACKBONE - START + embed_dim = config.backbone_config.embed_dim + for layer, depth in enumerate(config.backbone_config.depths): + hidden_size = embed_dim * 2**layer + for block in range(depth): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] + + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] + + 
state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight"] = in_proj_weight[-hidden_size :, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias"] = in_proj_bias[-hidden_size :] + ########################################## VISION BACKBONE - END + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + +@torch.no_grad() +def convert_grounding_dino_checkpoint(model_name, checkpoint_path): + #Define default GroundingDINO configuation + config = get_grounding_dino_config(model_name) + + # Load original checkpoint + original_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + + # Rename keys + new_state_dict = original_state_dict.copy() + rename_keys = create_rename_keys(config) + for src, dest in rename_keys: + rename_key(new_state_dict, src, dest) + read_in_q_k_v(new_state_dict, config) + + # Load HF implementation with default config and converted state dict + model = GroundingDINOForObjectDetection(config).eval() + model.load_state_dict(new_state_dict, strict=False) + + # Load and process test image + image = prepare_img() + image_processor = T.Compose( + [ + T.Resize(size=800, max_size=1333), + T.ToTensor(), + T.Normalize(IMAGENET_MEAN, IMAGENET_STD) + ] + ) + inputs = image_processor(image) + pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) + output= model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + for feature_map in output.feature_maps: + print(f"{feature_map.shape}") + print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") + + # outputs = model(**inputs).logits + + # print(outputs.keys()) + # print("Looks ok!") + + # if pytorch_dump_folder_path is not None: + # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + # model.save_pretrained(pytorch_dump_folder_path) + + # print(f"Saving image processor to {pytorch_dump_folder_path}") + # image_processor.save_pretrained(pytorch_dump_folder_path) + + # if push_to_hub: + # print(f"Pushing model and image processor for {model_name} to hub") + # model.push_to_hub(f"microsoft/{model_name}") + # image_processor.push_to_hub(f"microsoft/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="grounding-dino-tiny", + type=str, + choices=["grounding-dino-tiny", "grounding-dino-base"], + help="Name of the GroundingDINO model you'd like to convert.", + ) + parser.add_argument( + "--checkpoint_path", + default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny.pth", + type=str, + help="Path to the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_grounding_dino_checkpoint(args.model_name, args.checkpoint_path) \ No newline at end of file diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py deleted file mode 100644 index d3cef0366b2bca..00000000000000 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py +++ /dev/null @@ -1,237 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Grounding DINO checkpoints.""" - - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import cached_download, hf_hub_url -from PIL import Image - -from transformers import GroundingDINOConfig, GroundingDINOForObjectDetection, DeformableDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_key(orig_key): - if "backbone.0.body" in orig_key: - orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") - if "transformer" in orig_key: - orig_key = orig_key.replace("transformer.", "") - if "norm1" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm1", "self_attn_layer_norm") - else: - orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") - if "norm2" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm2", "final_layer_norm") - else: - orig_key = orig_key.replace("norm2", "self_attn_layer_norm") - if "norm3" in orig_key: - orig_key = orig_key.replace("norm3", "final_layer_norm") - if "linear1" in orig_key: - orig_key = orig_key.replace("linear1", "fc1") - if "linear2" in orig_key: - orig_key = orig_key.replace("linear2", "fc2") - if "query_embed" in orig_key: - orig_key = orig_key.replace("query_embed", "query_position_embeddings") - if "cross_attn" in orig_key: - orig_key = orig_key.replace("cross_attn", "encoder_attn") - - return orig_key - - -def read_in_q_k_v(state_dict): - # transformer decoder self-attention layers - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of 
cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_grounding_dino_checkpoint( - checkpoint_path, - single_scale, - dilation, - with_box_refine, - two_stage, - pytorch_dump_folder_path, - push_to_hub, -): - """ - Copy/paste/tweak model's weights to our Grounding DINO structure. - """ - - # load default config - config = GroundingDINOConfig() - # set config attributes - if single_scale: - config.num_feature_levels = 1 - config.dilation = dilation - config.with_box_refine = with_box_refine - config.two_stage = two_stage - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - image_processor = DeformableDetrImageProcessor(format="coco_detection") - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "model." - for key in state_dict.copy().keys(): - if not key.startswith("class_embed") and not key.startswith("bbox_embed"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = GroundingDINOForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - # verify our conversion - outputs = model(pixel_values.to(device)) - - expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ) - expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) - - if single_scale: - expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] - ) - expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) - - if single_scale and dilation: - expected_logits = torch.tensor( - [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] - ) - expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) - - if with_box_refine: - expected_logits = torch.tensor( - [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] - ) - expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) - - if with_box_refine and two_stage: - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ) - expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], 
[0.5490, 0.2763, 0.0564]]) - - print("Logits:", outputs.logits[0, :3, :3]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - - print("Everything ok!") - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - model_name = "deformable-detr" - model_name += "-single-scale" if single_scale else "" - model_name += "-dc5" if dilation else "" - model_name += "-with-box-refine" if with_box_refine else "" - model_name += "-two-stage" if two_stage else "" - print("Pushing model to hub...") - model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - type=str, - default="/home/niels/checkpoints/grounding_dino/r50_grounding_dino-checkpoint.pth", - help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", - ) - parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") - parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") - parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") - parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
- ) - args = parser.parse_args() - convert_grounding_dino_checkpoint( - args.checkpoint_path, - args.single_scale, - args.dilation, - args.with_box_refine, - args.two_stage, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ee80a562e4b851..603bdfdd8e8126 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -122,10 +122,10 @@ def backward(context, grad_output): logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "GroundingDINOConfig" -_CHECKPOINT_FOR_DOC = "idea-research/grg-dino-tiny" +_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "idea-research/grg-dino-tiny", + "idea-research/grounding-dino-tiny", # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino ] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 2991bca449b3c7..22f24222f67514 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2486,6 +2486,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GroundingDINOForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = None From 92c31bfa6ae676313b48e88adbf53628167dbb8f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 27 Aug 2023 01:47:21 -0300 Subject: [PATCH 042/252] Added bert to model --- .../configuration_grounding_dino.py | 7 +- .../convert_grounding_dino_to_hf.py | 13 +- .../grounding_dino/modeling_grounding_dino.py | 686 +++++++++++++++++- 3 files changed, 692 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 23cd86fd3f9d44..9025d01e725561 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -16,7 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -44,6 +44,8 @@ class GroundingDINOConfig(PretrainedConfig): backbone_config (`PretrainedConfig` or `dict`, *optional*): The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which case it will default to `ResNetConfig()`. + text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): + The configuration of the text backbone model. Should be a bert-like config. num_channels (`int`, *optional*, defaults to 3): The number of input channels. 
num_queries (`int`, *optional*, defaults to 300): @@ -153,6 +155,7 @@ def __init__( self, use_timm_backbone=False, backbone_config={"model_type": "swin"}, + text_backbone_config="bert-base-uncased", num_channels=3, num_queries=300, max_position_embeddings=1024, @@ -251,6 +254,8 @@ def __init__( self.eos_coefficient = eos_coefficient self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels + # Text backbone + self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index b5de1d8a652c0e..d5b07b32c3f49f 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -60,7 +60,7 @@ def get_grounding_dino_config(model_name): return config -def create_rename_keys(config): +def create_rename_keys(state_dict, config): rename_keys = [] # fmt: off #TODO names might change after modifing GroundingDINOModel class @@ -126,10 +126,14 @@ def create_rename_keys(config): ########################################## VISION BACKBONE - END + ########################################## TEXT BACKBONE - START + for layer_name, params in state_dict.items(): + if "module.bert" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) + ########################################## TEXT BACKBONE - END # fmt: on return rename_keys - def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val @@ -172,7 +176,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Rename keys new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(config) + rename_keys = create_rename_keys(original_state_dict, config) for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) @@ -192,7 +196,8 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) inputs = image_processor(image) pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) - output= model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + output = model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + for feature_map in output.feature_maps: print(f"{feature_map.shape}") print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 603bdfdd8e8126..8bea6eee50096e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -19,7 +19,7 @@ import math import warnings from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -39,8 +39,13 @@ replace_return_docstrings, requires_backends, ) -from ...modeling_outputs import BaseModelOutput +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPoolingAndCrossAttentions, + BaseModelOutputWithPastAndCrossAttentions +) from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, 
find_pruneable_heads_and_indices, prune_linear_layer from ...pytorch_utils import meshgrid from ...utils import is_ninja_available, logging from ..auto import AutoBackbone @@ -173,7 +178,7 @@ class GroundingDINODecoderOutput(ModelOutput): # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOModelOutput(ModelOutput): """ - Base class for outputs of the Grounding DINO encoder-decoder model. + Base class for outputs of the Deformable DETR encoder-decoder model. Args: init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): @@ -432,6 +437,7 @@ def __init__(self, config): if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: parameter.requires_grad_(False) + # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDINO def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps @@ -600,7 +606,7 @@ def multi_scale_deformable_attention( # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOMultiscaleDeformableAttention(nn.Module): """ - Multiscale deformable attention as proposed in Grounding DINO. + Multiscale deformable attention as proposed in Deformable DETR. """ def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): @@ -736,7 +742,7 @@ class GroundingDINOMultiheadAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the Grounding DINO paper). + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). """ def __init__( @@ -1294,7 +1300,7 @@ class GroundingDINODecoder(GroundingDINOPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - Some tweaks for Grounding DINO: + Some tweaks for Deformable DETR: - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. - it also returns a stack of intermediate outputs and reference points from all decoding layers. 
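As an illustration of the `spatial_shapes` and `valid_ratios` inputs mentioned in the decoder docstring above, here is a minimal sketch of how Deformable-DETR-style models typically derive them from the per-level pixel masks. The helper name, shapes and values are assumptions made for the example only; they are not part of this patch.

import torch

def get_valid_ratio(mask: torch.Tensor) -> torch.Tensor:
    # mask: (batch_size, height, width), 1 for real pixels, 0 for padding
    _, height, width = mask.shape
    valid_height = mask[:, :, 0].sum(dim=1).float()
    valid_width = mask[:, 0, :].sum(dim=1).float()
    # fraction of each feature map that corresponds to unpadded image content
    return torch.stack([valid_width / width, valid_height / height], dim=-1)  # (batch_size, 2)

# one mask per feature level, e.g. strides 8/16/32/64 of a padded batch of 2 images
masks = [torch.ones(2, h, w, dtype=torch.long) for h, w in [(100, 152), (50, 76), (25, 38), (13, 19)]]
spatial_shapes = torch.tensor([m.shape[1:] for m in masks])             # (num_levels, 2)
valid_ratios = torch.stack([get_valid_ratio(m) for m in masks], dim=1)  # (batch_size, num_levels, 2)
print(spatial_shapes.shape, valid_ratios.shape)  # torch.Size([4, 2]) torch.Size([2, 4, 2])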
@@ -1310,7 +1316,7 @@ def __init__(self, config: GroundingDINOConfig): self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.gradient_checkpointing = False - # hack implementation for iterative bounding box refinement and two-stage Grounding DINO + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR self.bbox_embed = None self.class_embed = None @@ -1493,6 +1499,8 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + # Create Text Extractor + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -1772,7 +1780,7 @@ def forward( encoder_outputs[0], ~mask_flatten, spatial_shapes ) - # hack implementation for two-stage Grounding DINO + # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) @@ -1850,7 +1858,7 @@ class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): def __init__(self, config: GroundingDINOConfig): super().__init__(config) - # Grounding DINO encoder-decoder model + # Deformable DETR encoder-decoder model self.model = GroundingDINOModel(config) # Detection heads on top @@ -2178,6 +2186,7 @@ def loss_labels(self, outputs, targets, indices, num_boxes): return losses @torch.no_grad() + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality def loss_cardinality(self, outputs, targets, indices, num_boxes): """ Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. @@ -2193,6 +2202,7 @@ def loss_cardinality(self, outputs, targets, indices, num_boxes): losses = {"cardinality_error": card_err} return losses + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes def loss_boxes(self, outputs, targets, indices, num_boxes): """ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. 
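To make the `loss_boxes` docstring above concrete, a small self-contained sketch of the L1 + generalized-IoU combination used by DETR-style detectors follows. The toy box values, the `center_to_corners` helper and the use of `torchvision.ops.generalized_box_iou` are assumptions to keep the example runnable; the patch relies on its own box utilities.

import torch
from torchvision.ops import generalized_box_iou

def center_to_corners(boxes: torch.Tensor) -> torch.Tensor:
    # (center_x, center_y, width, height) -> (x_min, y_min, x_max, y_max)
    cx, cy, w, h = boxes.unbind(-1)
    return torch.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], dim=-1)

# three matched prediction/target pairs in normalized cxcywh format
src_boxes = torch.tensor([[0.5, 0.5, 0.2, 0.2], [0.3, 0.3, 0.1, 0.4], [0.7, 0.6, 0.3, 0.2]])
target_boxes = torch.tensor([[0.5, 0.5, 0.25, 0.2], [0.3, 0.35, 0.1, 0.4], [0.7, 0.6, 0.3, 0.2]])
num_boxes = src_boxes.shape[0]

# L1 regression term, summed over coordinates and normalized by the number of boxes
loss_bbox = torch.nn.functional.l1_loss(src_boxes, target_boxes, reduction="none").sum() / num_boxes
# GIoU term: only the diagonal (matched pairs) of the pairwise GIoU matrix is used
giou = torch.diag(generalized_box_iou(center_to_corners(src_boxes), center_to_corners(target_boxes)))
loss_giou = (1 - giou).sum() / num_boxes
print(float(loss_bbox), float(loss_giou))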
@@ -2217,12 +2227,14 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): losses["loss_giou"] = loss_giou.sum() / num_boxes return losses + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx def _get_source_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) source_idx = torch.cat([source for (source, _) in indices]) return batch_idx, source_idx + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx def _get_target_permutation_idx(self, indices): # permute targets following indices batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) @@ -2511,3 +2523,659 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): else: raise ValueError("Only 3-dimensional tensors are supported") return NestedTensor(tensor, mask) + +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText +class GroundingDINOTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = 
self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText +class GroundingDINOTextSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDINOTextModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText +class GroundingDINOTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText +class GroundingDINOTextAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = GroundingDINOTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = GroundingDINOTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText +class 
GroundingDINOTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText +class GroundingDINOTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText +class GroundingDINOTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = GroundingDINOTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = GroundingDINOTextAttention(config, position_embedding_type="absolute") + self.intermediate = GroundingDINOTextIntermediate(config) + self.output = GroundingDINOTextOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + 
cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText +class GroundingDINOTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([GroundingDINOTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText +class GroundingDINOTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + +# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText +class GroundingDINOTextModel(nn.Module): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__() + self.config = config + + self.embeddings = GroundingDINOTextEmbeddings(config) + self.encoder = GroundingDINOTextEncoder(config) + + self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) From 8f0a755c18f2d4065a5008bbd4202cbf44aa8a74 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 29 Aug 2023 23:30:53 -0300 Subject: [PATCH 043/252] Bert validated --- .../configuration_grounding_dino.py | 7 +- .../convert_grounding_dino_to_hf.py | 105 ++++++++++++++++-- .../grounding_dino/modeling_grounding_dino.py | 5 +- 3 files changed, 106 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 9025d01e725561..0b4df30f6ee46f 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -177,7 +177,7 @@ def __init__( return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", - backbone="resnet50", + backbone="swin", use_pretrained_backbone=True, dilation=False, num_feature_levels=4, @@ -196,6 +196,9 @@ def __init__( eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, + #other parameters + max_text_len = 256, + sub_sentence_present = True, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -256,6 +259,8 @@ def __init__( self.disable_custom_kernels = disable_custom_kernels # Text 
backbone self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) + self.max_text_len = max_text_len + self.sub_sentence_present = sub_sentence_present super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d5b07b32c3f49f..d5ebc9281b8733 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -25,7 +25,7 @@ import torchvision.transforms.functional as F from transformers import ( - GroundingDINOConfig, GroundingDINOForObjectDetection + GroundingDINOConfig, GroundingDINOForObjectDetection, AutoTokenizer ) IMAGENET_MEAN = [0.485, 0.456, 0.406] @@ -166,6 +166,88 @@ def prepare_img(): image = Image.open(requests.get(url, stream=True).raw).convert("RGB") return image +def text_processor(text: str, config): + def preprocess_caption(caption: str) -> str: + result = caption.lower().strip() + if result.endswith("."): + return result + return result + "." + def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + ) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + cate_to_token_mask_list = [ + torch.stack(cate_to_token_mask_listi, dim=0) + for cate_to_token_mask_listi in cate_to_token_mask_list + ] + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.to(torch.long) + tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) + special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + text = preprocess_caption(text) + tokenized = tokenizer([text], padding="longest", return_tensors="pt") + text_self_attention_masks, position_ids = 
generate_masks_with_special_tokens_and_transfer_map( + tokenized, special_tokens) + + max_text_len = config.max_text_len + sub_sentence_present = config.sub_sentence_present + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[ + :, : max_text_len, : max_text_len + ] + position_ids = position_ids[:, : max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, : max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, : max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : max_text_len] + + # extract text embeddings + if sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + + return tokenized_for_encoder + @torch.no_grad() def convert_grounding_dino_checkpoint(model_name, checkpoint_path): #Define default GroundingDINO configuation @@ -187,6 +269,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Load and process test image image = prepare_img() + text = "a cat" image_processor = T.Compose( [ T.Resize(size=800, max_size=1333), @@ -194,13 +277,21 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): T.Normalize(IMAGENET_MEAN, IMAGENET_STD) ] ) - inputs = image_processor(image) - pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) - output = model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + image_inputs = image_processor(image) + text_inputs = text_processor(text, config) + + pixel_mask = torch.ones( + ((1, image_inputs.shape[1], image_inputs.shape[2])), + dtype=torch.long, + device=image_inputs.device + ) + # output = model.model.backbone.conv_encoder.model(pixel_values=image_inputs.unsqueeze(0)) + output = model.model.text_backbone(**text_inputs) + print(output.last_hidden_state[:, :, :5]) - for feature_map in output.feature_maps: - print(f"{feature_map.shape}") - print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") + # for feature_map in output.last_hidden_state: + # print(f"{feature_map.shape}") + # print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") # outputs = model(**inputs).logits diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8bea6eee50096e..ebe151de480211 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -3014,7 +3014,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output # Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText -class GroundingDINOTextModel(nn.Module): +class GroundingDINOTextModel(PreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -3028,8 +3028,7 @@ class GroundingDINOTextModel(nn.Module): """ def __init__(self, config, add_pooling_layer=True): - super().__init__() - self.config = config + super().__init__(config) self.embeddings = GroundingDINOTextEmbeddings(config) self.encoder = GroundingDINOTextEncoder(config) From fb1c55c3d9ad42769ba7c16e6ab2643fa264a21c Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 31 Aug 2023 20:03:28 -0300 Subject: [PATCH 044/252] Created Text and Fusion layers for 
Encoder --- .../configuration_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 309 +++++++++++++++++- 2 files changed, 306 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0b4df30f6ee46f..e77d4be247b746 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -160,7 +160,7 @@ def __init__( num_queries=300, max_position_embeddings=1024, encoder_layers=6, - encoder_ffn_dim=1024, + encoder_ffn_dim=2048, encoder_attention_heads=8, decoder_layers=6, decoder_ffn_dim=1024, diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ebe151de480211..731172570c23d2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -854,6 +854,304 @@ def forward( return attn_output, attn_weights_reshaped +# Repeting some code to avoid convert nn.MultiheadAttention later +class GroundingDINOEncoderTextLayer(nn.Module): + def __init__( + self, + embed_dim, + num_heads, + ffn_dim: int, + dropout: float = 0.0, + bias: bool = True, + activation: str = 'relu' + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) + # Implementation of Feedforward model + self.fc1 = nn.Linear(embed_dim, ffn_dim) + self.dropout = nn.Dropout(dropout) + self.fc2 = nn.Linear(ffn_dim, embed_dim) + + self.layer_norm_before = nn.LayerNorm(embed_dim) + self.layer_norm_after = nn.LayerNorm(embed_dim) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = ACT2FN[activation] + self.num_heads = num_heads + + def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): + return hidden_state if position_embeddings is None else hidden_state + position_embeddings + + def forward( + self, + hidden_states: Tensor, + attention_masks: Optional[Tensor] = None, + position_embeddings: Optional[Tensor] = None, + ): # repeat attn mask + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[1]: + # bs, num_q, num_k + attention_masks = attention_masks.repeat(self.num_heads, 1, 1) + + q = k = self.with_pos_embed(hidden_states, position_embeddings) + attention_output = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks)[0] + + hidden_states = hidden_states + self.dropout1(attention_output) + hidden_states = self.layer_norm_before(hidden_states) + hidden_states = self.activation(self.fc1(hidden_states)) + attention_output = self.fc2(self.dropout(hidden_states)) + hidden_states = hidden_states + self.dropout2(attention_output) + hidden_states = self.layer_norm_after(hidden_states) + return hidden_states + +class BiMultiHeadAttention(nn.Module): + def __init__( + self, + vision_dim: int, + text_dim: int, + embed_dim: int, + num_heads: int, + dropout:float = 0.1 + ): + super().__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.vision_dim = vision_dim + self.text_dim = text_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
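# Illustrative aside (an assumed toy example, not part of this module): the attention implemented
# below computes one image-text similarity matrix and reuses it in both directions: softmax over
# the text axis updates image features, softmax over the image axis updates text features.
# A single-head sketch with made-up shapes:
import torch
import torch.nn.functional as F

batch_size, num_image_tokens, num_text_tokens, head_dim = 1, 5, 3, 4
image_states = torch.randn(batch_size, num_image_tokens, head_dim)
text_states = torch.randn(batch_size, num_text_tokens, head_dim)

# shared attention logits between every image token and every text token
scores = torch.bmm(image_states, text_states.transpose(1, 2)) / head_dim**0.5  # (batch, image, text)
# image tokens gather text context (softmax over text tokens)
image_to_text = torch.bmm(F.softmax(scores, dim=-1), text_states)
# text tokens gather image context (softmax over image tokens, via the transposed logits)
text_to_image = torch.bmm(F.softmax(scores.transpose(1, 2), dim=-1), image_states)
print(image_to_text.shape, text_to_image.shape)  # torch.Size([1, 5, 4]) torch.Size([1, 3, 4])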
+ self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.text_proj = nn.Linear(self.text_dim, self.embed_dim) + self.values_vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.values_text_proj = nn.Linear(self.text_dim, self.embed_dim) + + self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) + self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.vision_proj.weight) + self.vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.text_proj.weight) + self.text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_vision_proj.weight) + self.values_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_text_proj.weight) + self.values_text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_vision_proj.weight) + self.out_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_text_proj.weight) + self.out_text_proj.bias.data.fill_(0) + + def forward( + self, + vision_features: Tensor, + text_features: Tensor, + vision_attention_mask: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None + ): + """_summary_ + + Args: + vision_features Tensor: bs, n_img, dim + text_features Tensor: bs, n_text, dim + vision_attention_mask (Tensor, optional): _description_. bs, n_img + text_attention_mask (Tensor, optional): _description_. bs, n_text + + Returns: + _type_: _description_ + """ + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + bsz, tgt_len, _ = vision_features.size() + + vision_query_states = self.vision_proj(vision_features) * self.scale + vision_query_states = self._shape(vision_query_states, tgt_len, bsz) + + text_key_states = self.text_proj(text_features) + text_key_states = self._shape(text_key_states, -1, bsz) + + vision_value_states = self.values_vision_proj(vision_features) + vision_value_states = self._shape(vision_value_states, -1, bsz) + + text_value_states = self.values_text_proj(text_features) + text_value_states = self._shape(text_value_states, -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + + vision_query_states = vision_query_states.view(*proj_shape) + text_key_states = text_key_states.view(*proj_shape) + vision_value_states = vision_value_states.view(*proj_shape) + text_value_states = text_value_states.view(*proj_shape) + + src_len = text_key_states.size(1) + attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + attn_weights = attn_weights - attn_weights.max() + + attn_weights = torch.clamp( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + attn_weights = torch.clamp( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose(1, 2) + text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, 
keepdim=True)[0] + + text_attn_weights = torch.clamp( + text_attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + text_attn_weights = torch.clamp( + text_attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if vision_attention_mask is not None: + vision_attention_mask = ( + vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + text_attn_weights.masked_fill_(vision_attention_mask, float("-inf")) + + text_attn_weights = text_attn_weights.softmax(dim=-1) + + # mask language for vision + if text_attention_mask is not None: + text_attention_mask = ( + text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights.masked_fill_(text_attention_mask, float("-inf")) + vision_attn_weights = attn_weights.softmax(dim=-1) + + vision_attn_probs = F.dropout(vision_attn_weights, p=self.dropout, training=self.training) + text_attn_probs = F.dropout(text_attn_weights, p=self.dropout, training=self.training) + + vision_attn_output = torch.bmm(vision_attn_probs, text_value_states) + text_attn_output = torch.bmm(text_attn_probs, vision_value_states) + + if vision_attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`vision_attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" + ) + + if text_attn_output.size() != (bsz * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`text_attn_output` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" + ) + + vision_attn_output = vision_attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + vision_attn_output = vision_attn_output.transpose(1, 2) + vision_attn_output = vision_attn_output.reshape(bsz, tgt_len, self.embed_dim) + + text_attn_output = text_attn_output.view(bsz, self.num_heads, src_len, self.head_dim) + text_attn_output = text_attn_output.transpose(1, 2) + text_attn_output = text_attn_output.reshape(bsz, src_len, self.embed_dim) + + vision_attn_output = self.out_vision_proj(vision_attn_output) + text_attn_output = self.out_text_proj(text_attn_output) + + return vision_attn_output, text_attn_output + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO +class GroundingDINODropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + +class GroundingDINOBiAttention(nn.Module): + def __init__( + self, + vision_dim, + text_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super().__init__() + + # pre layer norm + self.layer_norm_vision = nn.LayerNorm(vision_dim) + self.layer_norm_text = nn.LayerNorm(text_dim) + self.attn = BiMultiHeadAttention( + vision_dim=vision_dim, text_dim=text_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.gamma_v = nn.Parameter(init_values * torch.ones((vision_dim)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((text_dim)), requires_grad=True) + + def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): + vision_features = self.layer_norm_vision(vision_features) + text_features = self.layer_norm_text(text_features) + delta_v, delta_l = self.attn( + vision_features, + text_features, + attention_mask_vision=attention_mask_vision, + attention_mask_text=attention_mask_text + ) + # vision_features, text_features = vision_features + delta_v, text_features + delta_l + vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) + text_features = text_features + self.drop_path(self.gamma_l * delta_l) + return vision_features, text_features # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO class GroundingDINOEncoderLayer(nn.Module): @@ -1499,8 +1797,6 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) - # Create Text Extractor - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -1850,7 +2146,6 @@ def forward( """, GROUNDING_DINO_START_DOCSTRING, ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with 
DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] @@ -1866,6 +2161,7 @@ def __init__(self, config: GroundingDINOConfig): self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) @@ -2588,6 +2884,8 @@ def forward( embeddings = self.dropout(embeddings) return embeddings +# Classes for Text Backbone (It's just a BERT model) + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3013,7 +3311,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: pooled_output = self.activation(pooled_output) return pooled_output -# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText class GroundingDINOTextModel(PreTrainedModel): """ @@ -3029,12 +3326,16 @@ class GroundingDINOTextModel(PreTrainedModel): def __init__(self, config, add_pooling_layer=True): super().__init__(config) + self.config = config self.embeddings = GroundingDINOTextEmbeddings(config) self.encoder = GroundingDINOTextEncoder(config) self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + # Initialize weights and apply final processing + self.post_init() + def get_input_embeddings(self): return self.embeddings.word_embeddings From 86131aff2aee36051ce1a9fef81fa552152aea12 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 31 Aug 2023 20:59:26 -0300 Subject: [PATCH 045/252] Adapted Encoder layer --- .../configuration_grounding_dino.py | 8 + .../grounding_dino/modeling_grounding_dino.py | 180 +++++++++++++----- 2 files changed, 137 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e77d4be247b746..3abf4912ebb651 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -199,6 +199,9 @@ def __init__( #other parameters max_text_len = 256, sub_sentence_present = True, + text_enhancer_dropout = 0.0, + fusion_droppath = 0.1, + fusion_dropout = 0.0, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -261,6 +264,11 @@ def __init__( self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) self.max_text_len = max_text_len self.sub_sentence_present = sub_sentence_present + # Text Enhancer + self.text_enhancer_dropout = text_enhancer_dropout + # Fusion + self.fusion_droppath = fusion_droppath + self.fusion_dropout = fusion_dropout super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 731172570c23d2..91129946c6141e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -855,30 +855,28 @@ def 
forward( return attn_output, attn_weights_reshaped # Repeting some code to avoid convert nn.MultiheadAttention later -class GroundingDINOEncoderTextLayer(nn.Module): - def __init__( - self, - embed_dim, - num_heads, - ffn_dim: int, - dropout: float = 0.0, - bias: bool = True, - activation: str = 'relu' - ): +#TODO is this an approriate way to name this? +class GroundingDINOTextEnhancerLayer(nn.Module): + """Vanilla Transformer with text embeddings as input""" + def __init__(self, config): super().__init__() - self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) + self.self_attn = nn.MultiheadAttention( + embed_dim=config.d_model, + num_heads=config.num_heads // 2, + dropout=config.text_enhancer_dropout + ) # Implementation of Feedforward model - self.fc1 = nn.Linear(embed_dim, ffn_dim) - self.dropout = nn.Dropout(dropout) - self.fc2 = nn.Linear(ffn_dim, embed_dim) + self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) + self.dropout = nn.Dropout(config.text_enhancer_dropout) + self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) - self.layer_norm_before = nn.LayerNorm(embed_dim) - self.layer_norm_after = nn.LayerNorm(embed_dim) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) + self.layer_norm_before = nn.LayerNorm(config.d_model) + self.layer_norm_after = nn.LayerNorm(config.d_model) + self.dropout1 = nn.Dropout(config.text_enhancer_dropout) + self.dropout2 = nn.Dropout(config.text_enhancer_dropout) - self.activation = ACT2FN[activation] - self.num_heads = num_heads + self.activation = ACT2FN[config.activation_fuction] + self.num_heads = config.num_heads // 2 def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -903,8 +901,8 @@ def forward( hidden_states = hidden_states + self.dropout2(attention_output) hidden_states = self.layer_norm_after(hidden_states) return hidden_states - -class BiMultiHeadAttention(nn.Module): + +class GroundingDINOBiMultiHeadAttention(nn.Module): def __init__( self, vision_dim: int, @@ -1106,38 +1104,26 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -class GroundingDINOBiAttention(nn.Module): - def __init__( - self, - vision_dim, - text_dim, - embed_dim, - num_heads, - dropout=0.1, - drop_path=0.0, - init_values=1e-4, - ): - """ - Inputs: - embed_dim - Dimensionality of input and attention feature vectors - hidden_dim - Dimensionality of hidden layer in feed-forward network - (usually 2-4x larger than embed_dim) - num_heads - Number of heads to use in the Multi-Head Attention block - dropout - Amount of dropout to apply in the feed-forward network - """ +class GroundingDINOFusionLayer(nn.Module): + def __init__(self, config, init_values=1e-4): super().__init__() + drop_path = config.fusion_droppath # pre layer norm - self.layer_norm_vision = nn.LayerNorm(vision_dim) - self.layer_norm_text = nn.LayerNorm(text_dim) - self.attn = BiMultiHeadAttention( - vision_dim=vision_dim, text_dim=text_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + self.layer_norm_vision = nn.LayerNorm(config.d_model) + self.layer_norm_text = nn.LayerNorm(config.d_model) + self.attn = GroundingDINOBiMultiHeadAttention( + vision_dim=config.d_model, + text_dim=config.d_model, + embed_dim=config.encoder_ffn_dim // 2, + num_heads=config.num_heads // 2, + dropout=config.fusion_dropout ) # add 
layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.gamma_v = nn.Parameter(init_values * torch.ones((vision_dim)), requires_grad=True) - self.gamma_l = nn.Parameter(init_values * torch.ones((text_dim)), requires_grad=True) + self.gamma_v = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) @@ -1153,8 +1139,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at text_features = text_features + self.drop_path(self.gamma_l * delta_l) return vision_features, text_features -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO -class GroundingDINOEncoderLayer(nn.Module): +#NOTE just renamed the class +class GroundingDINODeformableLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() self.embed_dim = config.d_model @@ -1238,6 +1224,98 @@ def forward( return outputs +def get_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, + ) -> Tensor: + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. 
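+
+    Example (illustrative sketch; the batch and sequence sizes below are arbitrary assumptions):
+
+        >>> # one scalar position per token, e.g. text position ids as used by the encoder layer
+        >>> position_ids = torch.arange(6, dtype=torch.float32)[None, :, None]  # shape (1, 6, 1), so n = 1
+        >>> pos_embed = get_sine_pos_embed(position_ids, num_pos_feats=256, exchange_xy=False)
+        >>> pos_embed.shape  # (..., n * num_pos_feats)
+        torch.Size([1, 6, 256])
+        >>> # exchange_xy should only be left True when the last dimension holds at least an (x, y) pair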
+ """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res + + +class GroundingDINOEncoderLayer(nn.Module): + def __init__(self, config) -> None: + super().__init_() + self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) + self.fusion_layer = GroundingDINOFusionLayer(config) + self.deformable_layer = GroundingDINODeformableLayer(config) + + def forward( + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None + ): + bs, n_text, text_dim = text_features.shape + if text_position_embedding is None and text_position_ids is None: + pos_text = ( + torch.arange(n_text, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + pos_text = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + vision_features, text_features = self.fusion_layer( + vision_features=vision_features, + text_features=text_features, + attention_mask_vision=key_padding_mask, + attention_mask_text=text_attention_mask, + ) + + text_features = self.text_enhancer_layer( + hidden_states=text_features.transpose(0, 1), + attention_masks=~text_self_attention_masks, # note we use ~ for mask here + position_embeddings=(pos_text.transpose(0, 1) if pos_text is not None else None), + ).transpose(0, 1) + + vision_features = self.deformable_layer( + hidden_states=vision_features, + attention_mask=key_padding_mask, + position_embeddings=vision_position_embedding, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + ) + + return vision_features, text_features + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO class GroundingDINODecoderLayer(nn.Module): @@ -1788,7 +1866,6 @@ def custom_forward(*inputs): """, GROUNDING_DINO_START_DOCSTRING, ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO class GroundingDINOModel(GroundingDINOPreTrainedModel): def __init__(self, config: GroundingDINOConfig): super().__init__(config) @@ -1797,6 +1874,8 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + # Create text backbone + self.text_backbone = 
GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -2161,7 +2240,6 @@ def __init__(self, config: GroundingDINOConfig): self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) From 8ad3226e297beadd5efdccbbdcadca98989d625e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 1 Sep 2023 11:37:07 -0300 Subject: [PATCH 046/252] Fixed typos --- .../grounding_dino/modeling_grounding_dino.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 91129946c6141e..984587d3997d67 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -862,7 +862,7 @@ def __init__(self, config): super().__init__() self.self_attn = nn.MultiheadAttention( embed_dim=config.d_model, - num_heads=config.num_heads // 2, + num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout ) # Implementation of Feedforward model @@ -875,8 +875,8 @@ def __init__(self, config): self.dropout1 = nn.Dropout(config.text_enhancer_dropout) self.dropout2 = nn.Dropout(config.text_enhancer_dropout) - self.activation = ACT2FN[config.activation_fuction] - self.num_heads = config.num_heads // 2 + self.activation = ACT2FN[config.activation_function] + self.num_heads = config.encoder_attention_heads // 2 def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -1116,7 +1116,7 @@ def __init__(self, config, init_values=1e-4): vision_dim=config.d_model, text_dim=config.d_model, embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.num_heads // 2, + num_heads=config.encoder_attention_heads // 2, dropout=config.fusion_dropout ) @@ -1258,25 +1258,25 @@ def sine_func(x: torch.Tensor): class GroundingDINOEncoderLayer(nn.Module): def __init__(self, config) -> None: - super().__init_() + super().__init__() self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) def forward( - self, - vision_features: Tensor, - vision_position_embedding: Tensor, - spatial_shapes: Tensor, - level_start_index: Tensor, - key_padding_mask: Tensor, - reference_points: Tensor, - text_features: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None, - text_position_embedding: Optional[Tensor] = None, - text_self_attention_masks: Optional[Tensor] = None, - text_position_ids: Optional[Tensor] = None - ): + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None + ): bs, n_text, text_dim = text_features.shape if text_position_embedding is None and text_position_ids is None: pos_text = 
( From 21e3fa2f70ee396268c2af1e3774db976aa91075 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 4 Sep 2023 13:08:37 -0300 Subject: [PATCH 047/252] Adjusted Encoder --- .../grounding_dino/modeling_grounding_dino.py | 234 +++++++++++++----- 1 file changed, 176 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 984587d3997d67..229c5d89c716f9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -173,6 +173,55 @@ class GroundingDINODecoderOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None cross_attentions: Optional[Tuple[torch.FloatTensor]] = None +@dataclass +class GroundingDINOEncoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDINOEncoder. This class extends + BaseModelOutput, due to: + - vision and text last hidden states + - vision and text intermediate hidden states + - vision and text attentions + - vision and text cross attentions + + Args: + last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the vision encoder. + last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the text encoder. + hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer + plus the initial embedding outputs. + hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer + plus the initial embedding outputs. + attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + """ + last_hidden_state_vision: torch.FloatTensor = None + last_hidden_state_text: torch.FloatTensor = None + hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + attentions_text: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + @dataclass # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO @@ -892,7 +941,7 @@ def forward( attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) - attention_output = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks)[0] + attention_output, attention_weights = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks) hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) @@ -900,7 +949,7 @@ def forward( attention_output = self.fc2(self.dropout(hidden_states)) hidden_states = hidden_states + self.dropout2(attention_output) hidden_states = self.layer_norm_after(hidden_states) - return hidden_states + return hidden_states, attention_weights class GroundingDINOBiMultiHeadAttention(nn.Module): def __init__( @@ -933,10 +982,6 @@ def __init__( self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) - self.stable_softmax_2d = True - self.clamp_min_for_underflow = True - self.clamp_max_for_overflow = True - self._reset_parameters() def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): @@ -1068,7 +1113,7 @@ def forward( vision_attn_output = self.out_vision_proj(vision_attn_output) text_attn_output = self.out_text_proj(text_attn_output) - return vision_attn_output, text_attn_output + return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) # Copied from transformers.models.beit.modeling_beit.drop_path def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: @@ -1128,16 +1173,16 @@ def __init__(self, config, init_values=1e-4): def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) - delta_v, delta_l = self.attn( + (delta_v, vision_attn), (delta_t, text_attn) = self.attn( vision_features, text_features, attention_mask_vision=attention_mask_vision, attention_mask_text=attention_mask_text ) - # vision_features, text_features = vision_features + delta_v, text_features + delta_l vision_features = vision_features + 
self.drop_path(self.gamma_v * delta_v) - text_features = text_features + self.drop_path(self.gamma_l * delta_l) - return vision_features, text_features + text_features = text_features + self.drop_path(self.gamma_l * delta_t) + + return (vision_features, vision_attn), (text_features, text_attn) #NOTE just renamed the class class GroundingDINODeformableLayer(nn.Module): @@ -1263,6 +1308,29 @@ def __init__(self, config) -> None: self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) + def get_text_position_embeddings( + self, + text_features: Tensor, + text_position_embedding: Tensor, + text_position_ids: Tensor + ) -> Tensor: + bs, n_text, text_dim = text_features.shape + if text_position_embedding is None and text_position_ids is None: + text_position_embedding = ( + torch.arange(n_text, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + return text_position_embedding + def forward( self, vision_features: Tensor, @@ -1277,35 +1345,28 @@ def forward( text_self_attention_masks: Optional[Tensor] = None, text_position_ids: Optional[Tensor] = None ): - bs, n_text, text_dim = text_features.shape - if text_position_embedding is None and text_position_ids is None: - pos_text = ( - torch.arange(n_text, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(bs, 1, 1) - ) - pos_text = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) - if text_position_ids is not None: - text_position_embedding = get_sine_pos_embed( - text_position_ids[..., None], num_pos_feats=256, exchange_xy=False - ) + text_position_embedding = self.get_text_position_embeddings( + text_features, + text_position_embedding, + text_position_ids + ) - vision_features, text_features = self.fusion_layer( + (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( vision_features=vision_features, text_features=text_features, attention_mask_vision=key_padding_mask, attention_mask_text=text_attention_mask, ) - text_features = self.text_enhancer_layer( + (text_features, text_enhanced_attn) = self.text_enhancer_layer( hidden_states=text_features.transpose(0, 1), attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=(pos_text.transpose(0, 1) if pos_text is not None else None), + position_embeddings=( + text_position_embedding.transpose(0, 1) if text_position_embedding is not None else None + ), ).transpose(0, 1) - vision_features = self.deformable_layer( + (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, attention_mask=key_padding_mask, position_embeddings=vision_position_embedding, @@ -1314,7 +1375,10 @@ def forward( level_start_index=level_start_index, ) - return vision_features, text_features + return ( + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn) + ) # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO @@ -1538,7 +1602,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from 
transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->GroundingDINO class GroundingDINOEncoder(GroundingDINOPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a @@ -1592,26 +1655,31 @@ def get_reference_points(spatial_shapes, valid_ratios, device): def forward( self, - inputs_embeds=None, - attention_mask=None, - position_embeddings=None, - spatial_shapes=None, - level_start_index=None, + vision_features: Tensor, + vision_attention_mask: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios=None, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: - 1 for pixel features that are real (i.e. **not masked**), - 0 for pixel features that are padding (i.e. **masked**). [What are attention masks?](../glossary#attention-mask) - position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Position embeddings that are added to the queries and keys in each self-attention layer. spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): Spatial shapes of each feature map. @@ -1619,6 +1687,21 @@ def forward( Starting index of each feature map. valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): Ratio of valid area in each feature level. + text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Flattened text features that are passed to the encoder. + text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + text_self_attention_masks (`torch.BoolTensor` of shape `(batch_size, text_seq_len, text_seq_len)`): + Masks to avoid performing attention between padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + text_position_ids (`torch.LongTensor` of shape `(batch_size, num_queries)`): + Position ids for text features. 
output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1634,41 +1717,76 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = inputs_embeds - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + #TODO check if this is necessary according to original implementation + vision_features = nn.functional.dropout(vision_features, p=self.dropout, training=self.training) - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None + encoder_vision_states = () if output_hidden_states else None + encoder_text_states = () if output_hidden_states else None + all_attn_fused_text = () if output_attentions else None + all_attn_fused_vision = () if output_attentions else None + all_attn_enhanced_text = () if output_attentions else None + all_attn_deformable = () if output_attentions else None for i, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + # INPUTS FOR ENCODER LAYER + # - vision_features: Tensor, + # - vision_position_embedding: Tensor, + # - spatial_shapes: Tensor, + # - level_start_index: Tensor, + # - key_padding_mask: Tensor, + # - reference_points: Tensor, + # - text_features: Optional[Tensor] = None, + # - text_attention_mask: Optional[Tensor] = None, + # - text_position_embedding: Optional[Tensor] = None, + # - text_self_attention_masks: Optional[Tensor] = None, + # - text_position_ids: Optional[Tensor] = None + (vision_features, text_features), attentions = encoder_layer( + vision_features=vision_features, + vision_position_embedding=vision_position_embedding, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - output_attentions=output_attentions, + key_padding_mask=vision_attention_mask, + reference_points=reference_points, + text_features=text_features, + text_attention_mask=text_attention_mask, + text_position_embedding=text_position_embedding, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=text_position_ids ) - hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) + all_attn_fused_vision += (attentions[0],) + all_attn_fused_text += (attentions[1],) + all_attn_enhanced_text += (attentions[2],) + all_attn_deformable += (attentions[3],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + enc_outputs = [ + vision_features, text_features, + all_attn_fused_vision, all_attn_fused_text, + all_attn_enhanced_text, all_attn_deformable + ] + return tuple(v for v in enc_outputs 
if v is not None) + return GroundingDINOEncoderOutput( + last_hidden_state_vision=vision_features, + last_hidden_state_text=text_features, + hidden_states_vision=encoder_vision_states, + hidden_states_text=encoder_text_states, + cross_attentions_vision=all_attn_fused_vision, + cross_attentions_text=all_attn_fused_text, + attentions_vision=all_attn_deformable, + attentions_text=all_attn_enhanced_text ) - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ From 5ddfa38fdf72b55bd793f3451b48274bdec794b0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 4 Sep 2023 13:09:56 -0300 Subject: [PATCH 048/252] Converted encoder to hf --- .../configuration_grounding_dino.py | 2 +- .../convert_grounding_dino_to_hf.py | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3abf4912ebb651..14e82704cb495b 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -157,7 +157,7 @@ def __init__( backbone_config={"model_type": "swin"}, text_backbone_config="bert-base-uncased", num_channels=3, - num_queries=300, + num_queries=900, max_position_embeddings=1024, encoder_layers=6, encoder_ffn_dim=2048, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d5ebc9281b8733..f9fc7e87d12bba 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -131,6 +131,88 @@ def create_rename_keys(state_dict, config): if "module.bert" in layer_name: rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) ########################################## TEXT BACKBONE - END + + ########################################## ENCODER - START + deformable_key_mappings = { + 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', + 'self_attn.sampling_offsets.bias': 'deformable_layer.self_attn.sampling_offsets.bias', + 'self_attn.attention_weights.weight': 'deformable_layer.self_attn.attention_weights.weight', + 'self_attn.attention_weights.bias': 'deformable_layer.self_attn.attention_weights.bias', + 'self_attn.value_proj.weight': 'deformable_layer.self_attn.value_proj.weight', + 'self_attn.value_proj.bias': 'deformable_layer.self_attn.value_proj.bias', + 'self_attn.output_proj.weight': 'deformable_layer.self_attn.output_proj.weight', + 'self_attn.output_proj.bias': 'deformable_layer.self_attn.output_proj.bias', + 'norm1.weight': 'deformable_layer.self_attn_layer_norm.weight', + 'norm1.bias': 'deformable_layer.self_attn_layer_norm.bias', + 'linear1.weight': 'deformable_layer.fc1.weight', + 'linear1.bias': 'deformable_layer.fc1.bias', + 'linear2.weight': 'deformable_layer.fc2.weight', + 'linear2.bias': 'deformable_layer.fc2.bias', + 'norm2.weight': 'deformable_layer.final_layer_norm.weight', + 'norm2.bias': 'deformable_layer.final_layer_norm.bias', + } + text_enhancer_key_mappings = { + 'self_attn.in_proj_weight': 'text_enhancer_layer.self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 
'text_enhancer_layer.self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'text_enhancer_layer.self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'text_enhancer_layer.self_attn.out_proj.bias', + 'linear1.weight': 'text_enhancer_layer.fc1.weight', + 'linear1.bias': 'text_enhancer_layer.fc1.bias', + 'linear2.weight': 'text_enhancer_layer.fc2.weight', + 'linear2.bias': 'text_enhancer_layer.fc2.bias', + 'norm1.weight': 'text_enhancer_layer.layer_norm_before.weight', + 'norm1.bias': 'text_enhancer_layer.layer_norm_before.bias', + 'norm2.weight': 'text_enhancer_layer.layer_norm_after.weight', + 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', + } + fusion_key_mappings = { + 'gamma_v': 'fusion_layer.gamma_v', + 'gamma_l': 'fusion_layer.gamma_l', + 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', + 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', + 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', + 'layer_norm_l.bias': 'fusion_layer.layer_norm_text.bias', + 'attn.v_proj.weight': 'fusion_layer.attn.vision_proj.weight', + 'attn.v_proj.bias': 'fusion_layer.attn.vision_proj.bias', + 'attn.l_proj.weight': 'fusion_layer.attn.text_proj.weight', + 'attn.l_proj.bias': 'fusion_layer.attn.text_proj.bias', + 'attn.values_v_proj.weight': 'fusion_layer.attn.values_vision_proj.weight', + 'attn.values_v_proj.bias': 'fusion_layer.attn.values_vision_proj.bias', + 'attn.values_l_proj.weight': 'fusion_layer.attn.values_text_proj.weight', + 'attn.values_l_proj.bias': 'fusion_layer.attn.values_text_proj.bias', + 'attn.out_v_proj.weight': 'fusion_layer.attn.out_vision_proj.weight', + 'attn.out_v_proj.bias': 'fusion_layer.attn.out_vision_proj.bias', + 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', + 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', + } + + for layer in range(config.encoder_layers): + # deformable + for src, dest in deformable_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # text enhance + for src, dest in text_enhancer_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.text_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # fusion layers + for src, dest in fusion_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.fusion_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + ########################################## ENCODER - END + + #TODO convert decoder + ########################################## DECODER - START + ########################################## DECODER - END + + #TODO convert head + ########################################## HEAD - START + ########################################## HEAD - END + + #TODO convert additional layers + ########################################## Additional - START + ########################################## Additional - END + # fmt: on return rename_keys @@ -259,6 +341,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) + for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) From 0512f7a286d311617a222b77b841c6835a19b3aa Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 5 Sep 2023 16:10:51 -0300 Subject: [PATCH 049/252] Modified Decoder Layer --- .../grounding_dino/modeling_grounding_dino.py | 51 
++++++++++++++----- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 229c5d89c716f9..9f6edac849f2c9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1388,7 +1388,7 @@ def __init__(self, config: GroundingDINOConfig): self.embed_dim = config.d_model # self-attention - self.self_attn = GroundingDINOMultiheadAttention( + self.self_attn = nn.MultiheadAttention( embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, @@ -1398,6 +1398,13 @@ def __init__(self, config: GroundingDINOConfig): self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention text + self.encoder_attn_text = nn.MultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( config, @@ -1410,6 +1417,9 @@ def __init__(self, config: GroundingDINOConfig): self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) self.final_layer_norm = nn.LayerNorm(self.embed_dim) + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + def forward( self, hidden_states: torch.Tensor, @@ -1417,8 +1427,11 @@ def forward( reference_points=None, spatial_shapes=None, level_start_index=None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, + vision_encoder_hidden_states: Optional[torch.Tensor] = None, + vision_encoder_attention_mask: Optional[torch.Tensor] = None, + text_encoder_hidden_states: Optional[torch.Tensor] = None, + text_encoder_attention_mask: Optional[torch.Tensor] = None, + self_attn_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): """ @@ -1446,9 +1459,10 @@ def forward( # Self Attention hidden_states, self_attn_weights = self.self_attn( - hidden_states=hidden_states, - position_embeddings=position_embeddings, - output_attentions=output_attentions, + query=self.with_pos_embed(hidden_states, position_embeddings), + key=self.with_pos_embed(hidden_states, position_embeddings), + value=hidden_states, + attn_mask=self_attn_mask ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1457,13 +1471,27 @@ def forward( second_residual = hidden_states + # Cross-Attention Text + hidden_states, text_cross_attn_weights = self.encoder_attn_text( + query=self.with_pos_embed(hidden_states, position_embeddings), + key=text_encoder_hidden_states.transpose(0, 1), + value=text_encoder_hidden_states.transpose(0, 1), + attn_mask=text_encoder_attention_mask, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + hidden_states = self.encoder_attn_text_layer_norm(hidden_states) + + third_residual = hidden_states + # Cross-Attention cross_attn_weights = None hidden_states, cross_attn_weights = self.encoder_attn( hidden_states=hidden_states, - attention_mask=encoder_attention_mask, - encoder_hidden_states=encoder_hidden_states, - 
encoder_attention_mask=encoder_attention_mask, + attention_mask=vision_encoder_attention_mask, + encoder_hidden_states=vision_encoder_hidden_states, + encoder_attention_mask=vision_encoder_attention_mask, position_embeddings=position_embeddings, reference_points=reference_points, spatial_shapes=spatial_shapes, @@ -1472,8 +1500,7 @@ def forward( ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = second_residual + hidden_states - + hidden_states = third_residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) # Fully Connected @@ -1488,7 +1515,7 @@ def forward( outputs = (hidden_states,) if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) + outputs += (self_attn_weights, text_cross_attn_weights, cross_attn_weights) return outputs From d2cd35f204b12257250ed7db9f004af93b7dfc7b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 6 Sep 2023 14:33:57 -0300 Subject: [PATCH 050/252] Modified main decoder class --- .../configuration_grounding_dino.py | 6 +-- .../convert_grounding_dino_to_hf.py | 37 ++++++++++++++ .../grounding_dino/modeling_grounding_dino.py | 49 +++++++++++++------ 3 files changed, 73 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 14e82704cb495b..33de7c666cef19 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -163,7 +163,7 @@ def __init__( encoder_ffn_dim=2048, encoder_attention_heads=8, decoder_layers=6, - decoder_ffn_dim=1024, + decoder_ffn_dim=2048, decoder_attention_heads=8, encoder_layerdrop=0.0, is_encoder_decoder=True, @@ -183,9 +183,9 @@ def __init__( num_feature_levels=4, encoder_n_points=4, decoder_n_points=4, - two_stage=False, + two_stage=True, two_stage_num_proposals=300, - with_box_refine=False, + with_box_refine=True, class_cost=1, bbox_cost=5, giou_cost=2, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index f9fc7e87d12bba..846892980d2d21 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -203,6 +203,43 @@ def create_rename_keys(state_dict, config): #TODO convert decoder ########################################## DECODER - START + key_mappings_decoder = { + 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', + 'cross_attn.sampling_offsets.bias': 'encoder_attn.sampling_offsets.bias', + 'cross_attn.attention_weights.weight': 'encoder_attn.attention_weights.weight', + 'cross_attn.attention_weights.bias': 'encoder_attn.attention_weights.bias', + 'cross_attn.value_proj.weight': 'encoder_attn.value_proj.weight', + 'cross_attn.value_proj.bias': 'encoder_attn.value_proj.bias', + 'cross_attn.output_proj.weight': 'encoder_attn.output_proj.weight', + 'cross_attn.output_proj.bias': 'encoder_attn.output_proj.bias', + 'norm1.weight': 'encoder_attn_layer_norm.weight', + 'norm1.bias': 'encoder_attn_layer_norm.bias', + 'ca_text.in_proj_weight': 'encoder_attn_text.in_proj_weight', + 'ca_text.in_proj_bias': 'encoder_attn_text.in_proj_bias', + 'ca_text.out_proj.weight': 'encoder_attn_text.out_proj.weight', + 'ca_text.out_proj.bias': 'encoder_attn_text.out_proj.bias', + 
'catext_norm.weight': 'encoder_attn_text_layer_norm.weight', + 'catext_norm.bias': 'encoder_attn_text_layer_norm.bias', + 'self_attn.in_proj_weight': 'self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 'self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'self_attn.out_proj.bias', + 'norm2.weight': 'self_attn_layer_norm.weight', + 'norm2.bias': 'self_attn_layer_norm.bias', + 'linear1.weight': 'fc1.weight', + 'linear1.bias': 'fc1.bias', + 'linear2.weight': 'fc2.weight', + 'linear2.bias': 'fc2.bias', + 'norm3.weight': 'final_layer_norm.weight', + 'norm3.bias': 'final_layer_norm.bias', + } + for layer_num in range(config.decoder_layers): + source_prefix_decoder = f'module.transformer.decoder.layers.{layer_num}.' + target_prefix_decoder = f'model.decoder.layers.{layer_num}.' + + for source_name, target_name in key_mappings_decoder.items(): + rename_keys.append((source_prefix_decoder + source_name, + target_prefix_decoder + target_name)) ########################################## DECODER - END #TODO convert head diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9f6edac849f2c9..d57e823199703a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -160,10 +160,14 @@ class GroundingDINODecoderOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + vision_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. + text_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the text cross-attention heads. 
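+
+    Example (illustrative sketch; assumes `outputs` was produced by a `GroundingDINODecoder` forward
+    pass run with `output_attentions=True`, `return_dict=True` and both vision and text encoder
+    hidden states provided):
+
+        >>> # one tuple entry per decoder layer for each attention stream
+        >>> len(outputs.vision_cross_attentions) == len(outputs.text_cross_attentions) == len(outputs.attentions)
+        True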
""" last_hidden_state: torch.FloatTensor = None @@ -171,7 +175,8 @@ class GroundingDINODecoderOutput(ModelOutput): intermediate_reference_points: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None @dataclass class GroundingDINOEncoderOutput(ModelOutput): @@ -1814,7 +1819,6 @@ def forward( attentions_text=all_attn_enhanced_text ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. @@ -1840,20 +1844,24 @@ def __init__(self, config: GroundingDINOConfig): # hack implementation for iterative bounding box refinement and two-stage Deformable DETR self.bbox_embed = None self.class_embed = None + self.query_scale = None # Initialize weights and apply final processing self.post_init() def forward( self, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, + inputs_embeds, + vision_encoder_hidden_states, + vision_encoder_attention_mask=None, + text_encoder_hidden_states=None, + text_encoder_attention_mask=None, position_embeddings=None, reference_points=None, spatial_shapes=None, level_start_index=None, valid_ratios=None, + self_attn_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -1902,7 +1910,8 @@ def forward( # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None + all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None intermediate = () intermediate_reference_points = () @@ -1930,20 +1939,23 @@ def custom_forward(*inputs): layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, - encoder_hidden_states, - encoder_attention_mask, + vision_encoder_hidden_states, + vision_encoder_attention_mask, None, ) else: layer_outputs = decoder_layer( - hidden_states, + hidden_states=hidden_states, position_embeddings=position_embeddings, - encoder_hidden_states=encoder_hidden_states, reference_points=reference_points_input, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, + vision_encoder_hidden_states=vision_encoder_hidden_states, + vision_encoder_attention_mask=vision_encoder_attention_mask, + text_encoder_hidden_states=text_encoder_hidden_states, + text_encoder_attention_mask=text_encoder_attention_mask, + self_attn_mask=self_attn_mask, + output_attentions=output_attentions ) hidden_states = layer_outputs[0] @@ -1970,8 +1982,12 @@ def custom_forward(*inputs): if output_attentions: all_self_attns += (layer_outputs[1],) - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) + if text_encoder_hidden_states is not None: + all_cross_attns_text += (layer_outputs[2],) + + if 
vision_encoder_hidden_states is not None: + all_cross_attns_vision += (layer_outputs[3],) + # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) @@ -2000,7 +2016,8 @@ def custom_forward(*inputs): intermediate_reference_points=intermediate_reference_points, hidden_states=all_hidden_states, attentions=all_self_attns, - cross_attentions=all_cross_attentions, + vision_cross_attentions=all_cross_attns_vision, + text_cross_attentions=all_cross_attns_text ) From cb2ad7f51fc32eab1274909fed59811289d2e34e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 6 Sep 2023 14:38:56 -0300 Subject: [PATCH 051/252] Removed copy comments --- .../models/grounding_dino/modeling_grounding_dino.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d57e823199703a..8cd584c1fcc71c 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -137,7 +137,6 @@ def backward(context, grad_output): @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->GroundingDINO class GroundingDINODecoderOutput(ModelOutput): """ Base class for outputs of the GroundingDINODecoder. This class adds two attributes to @@ -1153,7 +1152,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) - class GroundingDINOFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() @@ -1386,7 +1384,6 @@ def forward( ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO class GroundingDINODecoderLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() @@ -2006,7 +2003,8 @@ def custom_forward(*inputs): intermediate_reference_points, all_hidden_states, all_self_attns, - all_cross_attentions, + all_cross_attns_vision, + all_cross_attns_text ] if v is not None ) From eaf958d1d3eb38c411d731cf7646cd30bcf22262 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 10 Sep 2023 23:21:17 -0300 Subject: [PATCH 052/252] Fixed forward from GroundingDINOModel and GroundingDINODecoder --- .../configuration_grounding_dino.py | 14 ++ .../convert_grounding_dino_to_hf.py | 9 + .../grounding_dino/modeling_grounding_dino.py | 190 +++++++++++++----- 3 files changed, 162 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 33de7c666cef19..bc43655df050ee 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -130,6 +130,18 @@ class GroundingDINOConfig(PretrainedConfig): disable_custom_kernels (`bool`, *optional*, defaults to `False`): Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom kernels are not supported by PyTorch ONNX export. + max_text_len (`int`, *optional*, defaults to 256): + The maximum length of the text input. + sub_sentence_present (`bool`, *optional*, defaults to `True`): + Whether to use sub-sentence present in the text input. 
+ text_enhancer_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the text enhancer. + fusion_droppath (`float`, *optional*, defaults to 0.1): + The droppath ratio for the fusion module. + fusion_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the fusion module. + embedding_init_target (`bool`, *optional*, defaults to `True`): + Whether to initialize the target with Embedding weights. Examples: @@ -202,6 +214,7 @@ def __init__( text_enhancer_dropout = 0.0, fusion_droppath = 0.1, fusion_dropout = 0.0, + embedding_init_target = True, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -269,6 +282,7 @@ def __init__( # Fusion self.fusion_droppath = fusion_droppath self.fusion_dropout = fusion_dropout + self.embedding_init_target = embedding_init_target super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 846892980d2d21..efced9cba0d522 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -248,6 +248,15 @@ def create_rename_keys(state_dict, config): #TODO convert additional layers ########################################## Additional - START + for layer_name, params in state_dict.items(): + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE + if "module.input_proj" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.input_proj", "model.input_proj_vision"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE + if "module.feat_map" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.feat_map", "model.input_proj_text"))) + #### + ########################################## Additional - END # fmt: on diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8cd584c1fcc71c..35ed14fa6859bc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1521,6 +1521,27 @@ def forward( return outputs +class GroundingDINOContrastiveEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.max_text_len = config.max_text_len + + def forward( + self, + vision_hidden_state: torch.FloatTensor, + text_hiddend_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor + ) -> torch.FloatTensor: + + + output = vision_hidden_state @ text_hiddend_state.transpose(-1, -2) + output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_output = torch.full((*output.shape[:-1], self.max_text_len), float("-inf"), device=output.device) + new_output[..., : output.shape[-1]] = output + + return new_output # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead class GroundingDINOClassificationHead(nn.Module): @@ -1836,6 +1857,12 @@ def __init__(self, config: GroundingDINOConfig): self.dropout = config.dropout self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) + self.reference_points_head = GroundingDINOMLPPredictionHead( + config.query_dim // 2 * config.d_model, + config.d_model, + config.d_model, + 2 + ) self.gradient_checkpointing = False # hack implementation for iterative bounding 
box refinement and two-stage Deformable DETR @@ -1846,6 +1873,45 @@ def __init__(self, config: GroundingDINOConfig): # Initialize weights and apply final processing self.post_init() + def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTensor: + """Get the position embedding of the proposals.""" + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries + pos_x = proposals[:, :, 0] * scale + pos_y = proposals[:, :, 1] * scale + # batch_size, num_queries, num_pos_feats + pos_x = pos_x[:, :, None] / dim_t + pos_y = pos_y[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + + if proposals.size(-1) == 2: + # batch_size, num_queries, num_pos_feats * 2 + pos = torch.cat((pos_y, pos_x), dim=2) + elif proposals.size(-1) == 4: + w_embed = proposals[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = proposals[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + # batch_size, num_queries, num_pos_feats * 4 + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError("Unknown proposals shape(-1):{}".format(proposals.size(-1))) + return pos + + + def forward( self, inputs_embeds, @@ -1853,7 +1919,6 @@ def forward( vision_encoder_attention_mask=None, text_encoder_hidden_states=None, text_encoder_attention_mask=None, - position_embeddings=None, reference_points=None, spatial_shapes=None, level_start_index=None, @@ -1875,8 +1940,6 @@ def forward( in `[0, 1]`: - 1 for pixels that are real (i.e. **not masked**), - 0 for pixels that are padding (i.e. **masked**). - position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): - Position embeddings that are added to the queries and keys in each self-attention layer. reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. 
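The `get_proposal_pos_embed` helper added above encodes normalized `(center_x, center_y, width, height)` reference boxes as sine/cosine features, one block of `d_model // 2` features per coordinate, before the `reference_points_head` MLP turns them into query position embeddings. A self-contained sketch of the same recipe, assuming `d_model = 256` to match the default config:

```python
import math
import torch

def proposal_pos_embed(proposals: torch.Tensor, d_model: int = 256) -> torch.Tensor:
    """Sine/cosine embedding of normalized (cx, cy, w, h) boxes -> (batch, queries, 2 * d_model)."""
    num_pos_feats = d_model // 2
    temperature = 10000
    scale = 2 * math.pi

    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)

    embeddings = []
    # One block of num_pos_feats features per coordinate, interleaving sin and cos.
    for coord in range(proposals.shape[-1]):
        pos = proposals[:, :, coord] * scale
        pos = pos[:, :, None] / dim_t
        pos = torch.stack((pos[:, :, 0::2].sin(), pos[:, :, 1::2].cos()), dim=3).flatten(2)
        embeddings.append(pos)
    # The hunk above concatenates in (y, x, w, h) order; swap to keep that parity.
    embeddings[0], embeddings[1] = embeddings[1], embeddings[0]
    return torch.cat(embeddings, dim=2)

boxes = torch.rand(2, 900, 4)  # normalized to [0, 1]
print(proposal_pos_embed(boxes).shape)  # torch.Size([2, 900, 512])
```

The 512-dimensional result matches the `config.query_dim // 2 * config.d_model` input size of the new `reference_points_head`.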
spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): @@ -1921,6 +1984,8 @@ def forward( if reference_points.shape[-1] != 2: raise ValueError("Reference points' last dimension must be of size 2") reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + query_pos = self.get_proposal_pos_embed(reference_points_input[:, :, 0, :]) + query_pos = self.reference_points_head(query_pos) if output_hidden_states: all_hidden_states += (hidden_states,) @@ -1943,7 +2008,7 @@ def custom_forward(*inputs): else: layer_outputs = decoder_layer( hidden_states=hidden_states, - position_embeddings=position_embeddings, + position_embeddings=query_pos, reference_points=reference_points_input, spatial_shapes=spatial_shapes, level_start_index=level_start_index, @@ -2034,8 +2099,6 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) - # Create text backbone - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -2057,9 +2120,9 @@ def __init__(self, config: GroundingDINOConfig): ) ) in_channels = config.d_model - self.input_proj = nn.ModuleList(input_proj_list) + self.input_proj_vision = nn.ModuleList(input_proj_list) else: - self.input_proj = nn.ModuleList( + self.input_proj_vision = nn.ModuleList( [ nn.Sequential( nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), @@ -2068,8 +2131,12 @@ def __init__(self, config: GroundingDINOConfig): ] ) - if not config.two_stage: - self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + # Create text backbone + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) + self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) + + if config.embedding_init_target or not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) self.encoder = GroundingDINOEncoder(config) self.decoder = GroundingDINODecoder(config) @@ -2079,10 +2146,8 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) - self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) - self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) else: - self.reference_points = nn.Linear(config.d_model, 2) + self.reference_points = nn.Embedding(config.num_queries, 4) self.post_init() @@ -2164,6 +2229,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) proposals.append(proposal) _cur += height * width + output_proposals = torch.cat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid @@ -2181,12 +2247,15 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values, - pixel_mask=None, - decoder_attention_mask=None, + pixel_values: Tensor, + input_ids: Tensor, + attention_mask: Tensor, + token_type_ids: Tensor, + text_token_mask: Tensor, + 
text_self_attention_masks: Tensor, + position_ids: Tensor, + pixel_mask: Optional[Tensor]=None, encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -2221,6 +2290,10 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Extract text features from text backbone + text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)["last_hidden_state"] + text_features = self.input_proj_text(text_features) + batch_size, num_channels, height, width = pixel_values.shape device = pixel_values.device @@ -2230,13 +2303,13 @@ def forward( # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) # First, sent pixel_values + pixel_mask through Backbone to obtain the features # which is a list of tuples - features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) sources = [] masks = [] - for level, (source, mask) in enumerate(features): - sources.append(self.input_proj[level](source)) + for level, (source, mask) in enumerate(vision_features): + sources.append(self.input_proj_vision[level](source)) masks.append(mask) if mask is None: raise ValueError("No attention mask was provided") @@ -2246,9 +2319,9 @@ def forward( _len_sources = len(sources) for level in range(_len_sources, self.config.num_feature_levels): if level == _len_sources: - source = self.input_proj[level](features[-1][0]) + source = self.input_proj_vision[level](vision_features[-1][0]) else: - source = self.input_proj[level](sources[-1]) + source = self.input_proj_vision[level](sources[-1]) mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) sources.append(source) @@ -2257,7 +2330,7 @@ def forward( # Create queries query_embeds = None - if not self.config.two_stage: + if self.config.embedding_init_target or self.config.two_stage: query_embeds = self.query_position_embeddings.weight # Prepare encoder inputs (by flattening) @@ -2288,26 +2361,35 @@ def forward( # Also provide spatial_shapes, level_start_index and valid_ratios if encoder_outputs is None: encoder_outputs = self.encoder( - inputs_embeds=source_flatten, - attention_mask=mask_flatten, - position_embeddings=lvl_pos_embed_flatten, + vision_features=source_flatten, + vision_attention_mask=mask_flatten, + vision_position_embedding=lvl_pos_embed_flatten, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, + text_features=text_features, + text_attention_mask=text_token_mask, + text_position_embedding=None, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=return_dict ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + # If the 
user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): + encoder_outputs = GroundingDINOEncoderOutput( + last_hidden_state_vision=encoder_outputs[0], + last_hidden_state_text=encoder_outputs[1], + hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, + attentions_vision=encoder_outputs[4] if len(encoder_outputs) > 4 else None, + attentions_text=encoder_outputs[5] if len(encoder_outputs) > 5 else None, + cross_attentions_vision=encoder_outputs[6] if len(encoder_outputs) > 6 else None, + cross_attentions_text=encoder_outputs[7] if len(encoder_outputs) > 7 else None, ) # Fifth, prepare decoder inputs - batch_size, _, num_channels = encoder_outputs[0].shape enc_outputs_class = None enc_outputs_coord_logits = None if self.config.two_stage: @@ -2318,14 +2400,19 @@ def forward( # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) - enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + enc_outputs_class = self.decoder.class_embed[-1]( + object_query_embedding, + encoder_outputs[1], + text_token_mask + ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals # only keep top scoring `config.two_stage_num_proposals` proposals topk = self.config.two_stage_num_proposals - topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_logits = enc_outputs_class.max(-1)[0] + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] topk_coords_logits = torch.gather( enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) ) @@ -2333,27 +2420,31 @@ def forward( topk_coords_logits = topk_coords_logits.detach() reference_points = topk_coords_logits.sigmoid() init_reference_points = reference_points - pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) - query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + if query_embeds: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + else: + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ).detach() else: - query_embed, target = torch.split(query_embeds, num_channels, dim=1) - query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) - target = target.unsqueeze(0).expand(batch_size, -1, -1) - reference_points = self.reference_points(query_embed).sigmoid() + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() init_reference_points = reference_points decoder_outputs = self.decoder( inputs_embeds=target, - position_embeddings=query_embed, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=mask_flatten, + vision_encoder_hidden_states=encoder_outputs[0], + vision_encoder_attention_mask=mask_flatten, + text_encoder_hidden_states=encoder_outputs[1], + text_encoder_attention_mask=text_token_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, + 
self_attn_mask=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=return_dict ) if not return_dict: @@ -2396,14 +2487,11 @@ def __init__(self, config: GroundingDINOConfig): self.model = GroundingDINOModel(config) # Detection heads on top - self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.class_embed = GroundingDINOContrastiveEmbedding(config) self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - prior_prob = 0.01 - bias_value = -math.log((1 - prior_prob) / prior_prob) - self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) From 88d07b3a3c293c8102f8f2ac3c2768985427cbc4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 11 Sep 2023 23:40:10 -0300 Subject: [PATCH 053/252] Added all necessary layers, configurations and forward logic up to GroundingDINOModel --- .../configuration_grounding_dino.py | 19 +++++++ .../grounding_dino/modeling_grounding_dino.py | 52 +++++++++++-------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index bc43655df050ee..e413d43b55cd89 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -142,6 +142,14 @@ class GroundingDINOConfig(PretrainedConfig): The dropout ratio for the fusion module. embedding_init_target (`bool`, *optional*, defaults to `True`): Whether to initialize the target with Embedding weights. + query_dim (`int`, *optional*, defaults to 4): + The dimension of the query vector. + decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): + Whether to share the bbox embedding between the decoder and the two-stage bbox generator. + two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. + two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. 
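The `decoder_bbox_embed_share` and `two_stage_*_share` options described above reduce to one question: do the per-layer heads reuse a single module, or does each layer get an independent deep copy? A minimal sketch of the two wiring patterns, with a toy MLP standing in for the real bbox head:

```python
import copy
import torch.nn as nn

bbox_head = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 4))
num_decoder_layers = 6

# Shared: every decoder layer points at the *same* module, so there is one set of
# weights and every layer's gradient accumulates into it.
shared_heads = nn.ModuleList([bbox_head for _ in range(num_decoder_layers)])

# Not shared: deep copies give each layer its own independent parameters.
independent_heads = nn.ModuleList([copy.deepcopy(bbox_head) for _ in range(num_decoder_layers)])

assert shared_heads[0] is shared_heads[-1]
assert independent_heads[0] is not independent_heads[-1]
```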
Examples: @@ -215,6 +223,10 @@ def __init__( fusion_droppath = 0.1, fusion_dropout = 0.0, embedding_init_target = True, + query_dim = 4, + decoder_bbox_embed_share = True, + two_stage_bbox_embed_share = False, + two_stage_class_embed_share = False, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -282,7 +294,14 @@ def __init__( # Fusion self.fusion_droppath = fusion_droppath self.fusion_dropout = fusion_dropout + # Others self.embedding_init_target = embedding_init_target + self.query_dim = query_dim + self.decoder_bbox_embed_share = decoder_bbox_embed_share + self.two_stage_bbox_embed_share = two_stage_bbox_embed_share + if two_stage_bbox_embed_share and not decoder_bbox_embed_share: + raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") + self.two_stage_class_embed_share = two_stage_class_embed_share super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 35ed14fa6859bc..4c35a8cf4b7814 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1856,6 +1856,7 @@ def __init__(self, config: GroundingDINOConfig): super().__init__(config) self.dropout = config.dropout + self.layer_norm = nn.LayerNorm(config.d_model) self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.reference_points_head = GroundingDINOMLPPredictionHead( config.query_dim // 2 * config.d_model, @@ -2038,7 +2039,7 @@ def custom_forward(*inputs): new_reference_points = new_reference_points.sigmoid() reference_points = new_reference_points.detach() - intermediate += (hidden_states,) + intermediate += (self.layer_norm(hidden_states),) intermediate_reference_points += (reference_points,) if output_attentions: @@ -2146,6 +2147,8 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) + self.encoder_output_bbox_embed = None + self.encoder_output_class_embed = None else: self.reference_points = nn.Embedding(config.num_queries, 4) @@ -2400,13 +2403,13 @@ def forward( # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. 
foreground and background) - enc_outputs_class = self.decoder.class_embed[-1]( + enc_outputs_class = self.encoder_output_class_embed( object_query_embedding, encoder_outputs[1], text_token_mask ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) - delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals # only keep top scoring `config.two_stage_num_proposals` proposals @@ -2487,32 +2490,35 @@ def __init__(self, config: GroundingDINOConfig): self.model = GroundingDINOModel(config) # Detection heads on top - self.class_embed = GroundingDINOContrastiveEmbedding(config) - self.bbox_embed = GroundingDINOMLPPredictionHead( + _class_embed = GroundingDINOContrastiveEmbedding(config) + _bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) - nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - # if two-stage, the last class_embed and bbox_embed is for region proposal generation - num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers - if config.with_box_refine: - self.class_embed = _get_clones(self.class_embed, num_pred) - self.bbox_embed = _get_clones(self.bbox_embed, num_pred) - nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) - # hack implementation for iterative bounding box refinement - self.model.decoder.bbox_embed = self.bbox_embed + + if config.decoder_bbox_embed_share: + self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: - nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) - self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) - self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) - self.model.decoder.bbox_embed = None + self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) + self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) + # hack implementation for two-stage + self.model.decoder.bbox_embed = self.bbox_embed + self.model.decoder.class_embed = self.class_embed + if config.two_stage: - # hack implementation for two-stage - self.model.decoder.class_embed = self.class_embed - for box_embed in self.bbox_embed: - nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + if config.two_stage_bbox_embed_share: + self.model.encoder_output_bbox_embed = _bbox_embed + else: + self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) + + #TODO don't believe this is necessary since class_embed has no parameters + if config.two_stage_class_embed_share: + self.model.encoder_output_class_embed = _class_embed + else: + self.model.encoder_output_class_embed = copy.deepcopy(_class_embed) # Initialize weights and apply final processing self.post_init() From f17bd3d6e5d6413613e24ee1777308c130523081 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 12 Sep 2023 00:16:28 -0300 Subject: [PATCH 054/252] Added all layers to convertion --- .../convert_grounding_dino_to_hf.py | 101 ++++++++++-------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py 
b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index efced9cba0d522..4c74404b19b288 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -66,72 +66,66 @@ def create_rename_keys(state_dict, config): #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer - rename_keys.append(("module.backbone.0.patch_embed.proj.weight", + rename_keys.append(("backbone.0.patch_embed.proj.weight", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("module.backbone.0.patch_embed.proj.bias", + rename_keys.append(("backbone.0.patch_embed.proj.bias", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("module.backbone.0.patch_embed.norm.weight", + rename_keys.append(("backbone.0.patch_embed.norm.weight", "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("module.backbone.0.patch_embed.norm.bias", + rename_keys.append(("backbone.0.patch_embed.norm.bias", "model.backbone.conv_encoder.model.embeddings.norm.bias")) for layer, depth in enumerate(config.backbone_config.depths): for block in range(depth): # layernorms - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) # intermidiate - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) # output - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) # downsample if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.reduction.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.bias", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) for out_indice in config.backbone_config.out_indices: # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"module.backbone.0.norm{out_indice-1}.weight", + rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"module.backbone.0.norm{out_indice-1}.bias", + rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) ########################################## VISION BACKBONE - END - ########################################## TEXT BACKBONE - START - for layer_name, params in state_dict.items(): - if "module.bert" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) - ########################################## TEXT BACKBONE - END - ########################################## ENCODER - START deformable_key_mappings = { 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', @@ -185,23 +179,21 @@ def create_rename_keys(state_dict, config): 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', } - for layer in range(config.encoder_layers): # deformable for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.layers.{layer}.{src}", + 
rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # text enhance for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.text_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # fusion layers for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.fusion_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) ########################################## ENCODER - END - #TODO convert decoder ########################################## DECODER - START key_mappings_decoder = { 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', @@ -234,7 +226,7 @@ def create_rename_keys(state_dict, config): 'norm3.bias': 'final_layer_norm.bias', } for layer_num in range(config.decoder_layers): - source_prefix_decoder = f'module.transformer.decoder.layers.{layer_num}.' + source_prefix_decoder = f'transformer.decoder.layers.{layer_num}.' target_prefix_decoder = f'model.decoder.layers.{layer_num}.' for source_name, target_name in key_mappings_decoder.items(): @@ -246,17 +238,36 @@ def create_rename_keys(state_dict, config): ########################################## HEAD - START ########################################## HEAD - END - #TODO convert additional layers ########################################## Additional - START for layer_name, params in state_dict.items(): + #### TEXT BACKBONE + if "bert" in layer_name: + rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE - if "module.input_proj" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.input_proj", "model.input_proj_vision"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE - if "module.feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.feat_map", "model.input_proj_text"))) - #### - + if "input_proj" in layer_name: + rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE + if "feat_map" in layer_name: + rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) + #### DECODER REFERENCE POINT HEAD + if "transformer.decoder.ref_point_head" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", + "model.decoder.reference_points_head"))) + #### DECODER BBOX EMBED + if "transformer.decoder.bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", + "model.decoder.bbox_embed"))) + if "transformer.enc_output" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) + + if "transformer.enc_out_bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", + "model.encoder_output_bbox_embed"))) + + rename_keys.append(("transformer.level_embed", "model.level_embed")) + rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) + rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) + rename_keys.append(("transformer.tgt_embed.weight", "model.query_position_embeddings.weight")) 
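The mappings collected above are only `(old, new)` name pairs; the conversion script still has to move each tensor inside the loaded state dict. A hedged sketch of that step, using a toy state dict and a helper whose name is illustrative rather than taken from the script:

```python
import torch

def rename_key(state_dict: dict, old: str, new: str) -> None:
    # Move a tensor to its new name; pop so the stale key does not survive conversion.
    state_dict[new] = state_dict.pop(old)

# Toy state dict standing in for the original Grounding DINO checkpoint.
state_dict = {"backbone.0.patch_embed.proj.weight": torch.zeros(96, 3, 4, 4)}
rename_keys = [
    (
        "backbone.0.patch_embed.proj.weight",
        "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight",
    ),
]

for old, new in rename_keys:
    rename_key(state_dict, old, new)

print(list(state_dict))
```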
########################################## Additional - END # fmt: on @@ -274,8 +285,8 @@ def read_in_q_k_v(state_dict, config): hidden_size = embed_dim * 2**layer for block in range(depth): # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") + in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] @@ -382,7 +393,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): config = get_grounding_dino_config(model_name) # Load original checkpoint - original_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + original_state_dict = torch.load(checkpoint_path, map_location="cpu") # Rename keys new_state_dict = original_state_dict.copy() @@ -452,7 +463,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) parser.add_argument( "--checkpoint_path", - default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny.pth", + default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", type=str, help="Path to the original PyTorch checkpoint (.pth file).", ) From dcd1990175d41d3574be4fc23629661a3ca5868a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 12 Sep 2023 11:24:24 -0300 Subject: [PATCH 055/252] Fixed outputs for GroundingDINOModel and GroundingDINOForObjectDetection --- .../grounding_dino/modeling_grounding_dino.py | 156 +++++++++++++----- 1 file changed, 113 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 4c35a8cf4b7814..c3d094285dcf0d 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -228,10 +228,9 @@ class GroundingDINOEncoderOutput(ModelOutput): @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOModelOutput(ModelOutput): """ - Base class for outputs of the Deformable DETR encoder-decoder model. + Base class for outputs of the Grounding DINO encoder-decoder model. Args: init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): @@ -250,25 +249,47 @@ class GroundingDINOModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. 
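The `read_in_q_k_v` hunk above slices each Swin block's fused `qkv` projection into separate query, key and value weights before loading them into the HF layout. A standalone sketch of that slicing, assuming the usual `[q; k; v]` row ordering and an illustrative hidden size:

```python
import torch

hidden_size = 96  # first stage of a Swin-tiny-like backbone; illustrative only
qkv_weight = torch.randn(3 * hidden_size, hidden_size)
qkv_bias = torch.randn(3 * hidden_size)

# The fused projection stacks query, key and value rows in that order,
# so three equal slices recover the separate projections.
query_w = qkv_weight[:hidden_size]
key_w = qkv_weight[hidden_size : 2 * hidden_size]
value_w = qkv_weight[2 * hidden_size :]
query_b, key_b, value_b = qkv_bias[:hidden_size], qkv_bias[hidden_size : 2 * hidden_size], qkv_bias[2 * hidden_size :]

assert query_w.shape == key_w.shape == value_w.shape == (hidden_size, hidden_size)
```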
- cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the - self-attention heads. - enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each + layer plus the initial embedding outputs. 
+ encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). - enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. 
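`enc_outputs_class` and `enc_outputs_coord_logits` documented above drive the two-stage proposal selection: each encoder pixel is scored by its best text-token logit, the top `two_stage_num_proposals` pixels are kept, and their box logits (after a sigmoid) become the initial reference points. A minimal sketch of that selection with made-up sizes:

```python
import torch

batch_size, num_pixels, max_text_len, num_proposals = 2, 13000, 256, 900

enc_outputs_class = torch.randn(batch_size, num_pixels, max_text_len)  # per-pixel text logits
enc_outputs_coord_logits = torch.randn(batch_size, num_pixels, 4)      # per-pixel box logits

# Score each pixel by its best-matching text token, then keep the top proposals.
topk_logits = enc_outputs_class.max(-1).values                          # (batch, num_pixels)
topk_proposals = torch.topk(topk_logits, num_proposals, dim=1).indices  # (batch, num_proposals)

topk_coords_logits = torch.gather(
    enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
)
reference_points = topk_coords_logits.detach().sigmoid()                # (batch, num_proposals, 4)
print(reference_points.shape)
```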
""" @@ -278,16 +299,21 @@ class GroundingDINOModelOutput(ModelOutput): intermediate_reference_points: torch.FloatTensor = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->GroundingDINO class GroundingDINOObjectDetectionOutput(ModelOutput): """ Output type of [`GroundingDINOForObjectDetection`]. @@ -320,20 +346,42 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. 
- encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, - 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average - in the self-attention heads. + encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. 
+ encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -359,12 +407,18 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): intermediate_reference_points: Optional[torch.FloatTensor] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - enc_outputs_class: Optional = None - enc_outputs_coord_logits: Optional = None + decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None def _get_clones(module, N): @@ -1988,8 +2042,11 @@ def forward( query_pos = self.get_proposal_pos_embed(reference_points_input[:, :, 0, :]) query_pos = self.reference_points_head(query_pos) + # In original implementation they apply layer norm before outputting intermediate hidden states + # Though that's not through between layers so the layers use as input the output of the previous layer + # withtout layer norm if output_hidden_states: - all_hidden_states += (hidden_states,) + all_hidden_states += (self.layer_norm(hidden_states),) if self.gradient_checkpointing and self.training: @@ -2055,6 +2112,7 @@ def custom_forward(*inputs): # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + hidden_states = self.layer_norm(hidden_states) # add hidden states from the last decoder layer if output_hidden_states: @@ -2463,10 +2521,16 @@ def forward( intermediate_reference_points=decoder_outputs.intermediate_reference_points, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - 
encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, + decoder_cross_attentions_vision=decoder_outputs.vision_cross_attentions, + decoder_cross_attentions_text=decoder_outputs.text_cross_attentions, + encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, + encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, + encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, + encoder_hidden_states_text=encoder_outputs.hidden_states_text, + encoder_attentions_vision=encoder_outputs.attentions_vision, + encoder_attentions_text=encoder_outputs.attentions_text, + encoder_cross_attentions_vision=encoder_outputs.cross_attentions_vision, + encoder_cross_attentions_text=encoder_outputs.cross_attentions_text, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, ) @@ -2588,7 +2652,7 @@ def forward( ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # First, sent images through DETR base model to obtain encoder + decoder outputs + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( pixel_values, pixel_mask=pixel_mask, @@ -2688,10 +2752,16 @@ def forward( last_hidden_state=outputs.last_hidden_state, decoder_hidden_states=outputs.decoder_hidden_states, decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, + decoder_cross_attentions_vision=outputs.decoder_cross_attentions_vision, + decoder_cross_attentions_text=outputs.decoder_cross_attentions_text, + encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, + encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, + encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, + encoder_hidden_states_text=outputs.encoder_hidden_states_text, + encoder_attentions_vision=outputs.encoder_attentions_vision, + encoder_attentions_text=outputs.encoder_attentions_text, + encoder_cross_attentions_text=outputs.encoder_cross_attentions_text, + encoder_cross_attentions_vision=outputs.encoder_cross_attentions_vision, intermediate_hidden_states=outputs.intermediate_hidden_states, intermediate_reference_points=outputs.intermediate_reference_points, init_reference_points=outputs.init_reference_points, From 39a161c86a2867ffec0cc0e0bf7cdd0d4229bd7d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 11:58:02 -0300 Subject: [PATCH 056/252] Fixed mask input to encoders and fixed nn.MultiheadAttention batch first and attn output --- .../convert_grounding_dino_to_hf.py | 30 ++++----- .../grounding_dino/modeling_grounding_dino.py | 61 ++++++++++++------- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4c74404b19b288..15793a0df03ae7 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -385,7 +385,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token tokenized_for_encoder["attention_mask"] = text_self_attention_masks tokenized_for_encoder["position_ids"] = position_ids - return 
tokenized_for_encoder + return tokenized_for_encoder, tokenized.attention_mask.bool() @torch.no_grad() def convert_grounding_dino_checkpoint(model_name, checkpoint_path): @@ -418,25 +418,17 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ] ) image_inputs = image_processor(image) - text_inputs = text_processor(text, config) - - pixel_mask = torch.ones( - ((1, image_inputs.shape[1], image_inputs.shape[2])), - dtype=torch.long, - device=image_inputs.device + text_inputs, text_token_mask = text_processor(text, config) + + outputs = model( + pixel_values=image_inputs.unsqueeze(0), + input_ids=text_inputs["input_ids"], + attention_mask=text_inputs["attention_mask"], + token_type_ids=text_inputs["token_type_ids"], + text_token_mask=text_token_mask, + text_self_attention_masks=text_inputs["attention_mask"], + position_ids=text_inputs["position_ids"], ) - # output = model.model.backbone.conv_encoder.model(pixel_values=image_inputs.unsqueeze(0)) - output = model.model.text_backbone(**text_inputs) - print(output.last_hidden_state[:, :, :5]) - - # for feature_map in output.last_hidden_state: - # print(f"{feature_map.shape}") - # print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") - - # outputs = model(**inputs).logits - - # print(outputs.keys()) - # print("Looks ok!") # if pytorch_dump_folder_path is not None: # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index c3d094285dcf0d..2cc715b10cce4f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -970,7 +970,8 @@ def __init__(self, config): self.self_attn = nn.MultiheadAttention( embed_dim=config.d_model, num_heads=config.encoder_attention_heads // 2, - dropout=config.text_enhancer_dropout + dropout=config.text_enhancer_dropout, + batch_first=True, ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) @@ -999,7 +1000,13 @@ def forward( attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) - attention_output, attention_weights = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks) + attention_output, attention_weights = self.self_attn( + query=q, + key=k, + value=hidden_states, + attn_mask=attention_masks, + average_attn_weights=False + ) hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) @@ -1233,8 +1240,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at (delta_v, vision_attn), (delta_t, text_attn) = self.attn( vision_features, text_features, - attention_mask_vision=attention_mask_vision, - attention_mask_text=attention_mask_text + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text ) vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) text_features = text_features + self.drop_path(self.gamma_l * delta_t) @@ -1448,6 +1455,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, + batch_first=True ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -1459,6 +1467,7 @@ def __init__(self, config: GroundingDINOConfig): 
embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, + batch_first=True ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -1518,7 +1527,8 @@ def forward( query=self.with_pos_embed(hidden_states, position_embeddings), key=self.with_pos_embed(hidden_states, position_embeddings), value=hidden_states, - attn_mask=self_attn_mask + attn_mask=self_attn_mask, + average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1533,6 +1543,7 @@ def forward( key=text_encoder_hidden_states.transpose(0, 1), value=text_encoder_hidden_states.transpose(0, 1), attn_mask=text_encoder_attention_mask, + average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -2423,13 +2434,13 @@ def forward( if encoder_outputs is None: encoder_outputs = self.encoder( vision_features=source_flatten, - vision_attention_mask=mask_flatten, + vision_attention_mask=~mask_flatten, vision_position_embedding=lvl_pos_embed_flatten, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, text_features=text_features, - text_attention_mask=text_token_mask, + text_attention_mask=~text_token_mask, text_position_embedding=None, text_self_attention_masks=text_self_attention_masks, text_position_ids=position_ids, @@ -2599,16 +2610,19 @@ def _set_aux_loss(self, outputs_class, outputs_coord): @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values, - pixel_mask=None, - decoder_attention_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + attention_mask: torch.BoolTensor, + token_type_ids: torch.LongTensor, + text_token_mask: torch.BoolTensor, + text_self_attention_masks: torch.BoolTensor, + position_ids: torch.LongTensor, + pixel_mask: Optional[torch.BoolTensor]=None, + encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]]=None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + return_dict: Optional[bool]=None, ): r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2654,12 +2668,15 @@ def forward( # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( - pixel_values, - pixel_mask=pixel_mask, - decoder_attention_mask=decoder_attention_mask, + pixel_values=pixel_values , + input_ids=input_ids , + attention_mask=attention_mask , + token_type_ids=token_type_ids , + text_token_mask=text_token_mask , + text_self_attention_masks=text_self_attention_masks , + position_ids=position_ids , + pixel_mask=pixel_mask , encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, From 5ec72fb47b27a3101ca5c087b5f2ccf49621da18 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 14:14:57 -0300 Subject: [PATCH 057/252] Fixed forward from GroundingDINOTextEnhancerLayer --- .../grounding_dino/modeling_grounding_dino.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 
deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2cc715b10cce4f..36822d53eaa9ab 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -975,16 +975,14 @@ def __init__(self, config): ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) - self.dropout = nn.Dropout(config.text_enhancer_dropout) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) self.layer_norm_before = nn.LayerNorm(config.d_model) self.layer_norm_after = nn.LayerNorm(config.d_model) - self.dropout1 = nn.Dropout(config.text_enhancer_dropout) - self.dropout2 = nn.Dropout(config.text_enhancer_dropout) self.activation = ACT2FN[config.activation_function] self.num_heads = config.encoder_attention_heads // 2 + self.dropout = config.text_enhancer_dropout def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -995,7 +993,7 @@ def forward( attention_masks: Optional[Tensor] = None, position_embeddings: Optional[Tensor] = None, ): # repeat attn mask - if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[1]: + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) @@ -1007,13 +1005,18 @@ def forward( attn_mask=attention_masks, average_attn_weights=False ) + attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) + hidden_states = hidden_states + attention_output + residual = hidden_states - hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) hidden_states = self.activation(self.fc1(hidden_states)) - attention_output = self.fc2(self.dropout(hidden_states)) - hidden_states = hidden_states + self.dropout2(attention_output) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = hidden_states + residual hidden_states = self.layer_norm_after(hidden_states) + return hidden_states, attention_weights class GroundingDINOBiMultiHeadAttention(nn.Module): @@ -1423,12 +1426,10 @@ def forward( ) (text_features, text_enhanced_attn) = self.text_enhancer_layer( - hidden_states=text_features.transpose(0, 1), + hidden_states=text_features, attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=( - text_position_embedding.transpose(0, 1) if text_position_embedding is not None else None - ), - ).transpose(0, 1) + position_embeddings=(text_position_embedding if text_position_embedding is not None else None) + ) (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, From 086f68a70351c826b01e2c25efc9bd5d8187c44b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 14:31:17 -0300 Subject: [PATCH 058/252] Fixed output bug with GroundingDINODeformableLayer --- .../models/grounding_dino/modeling_grounding_dino.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git 
a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 36822d53eaa9ab..e8e147cb00554a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1329,12 +1329,7 @@ def forward( clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights def get_sine_pos_embed( pos_tensor: torch.Tensor, From f75cda2f12466d3e281eb6e4bf5d24f8f7dd8d8a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 15 Sep 2023 18:57:37 -0300 Subject: [PATCH 059/252] Fixed bugs that prevent GroundingDINOForObjectDetection to run forward method --- .../configuration_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e413d43b55cd89..3a62780362d834 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -204,7 +204,7 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - two_stage_num_proposals=300, + two_stage_num_proposals=900, with_box_refine=True, class_cost=1, bbox_cost=5, diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index e8e147cb00554a..2e9d7d3d0de7f5 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1536,9 +1536,9 @@ def forward( # Cross-Attention Text hidden_states, text_cross_attn_weights = self.encoder_attn_text( query=self.with_pos_embed(hidden_states, position_embeddings), - key=text_encoder_hidden_states.transpose(0, 1), - value=text_encoder_hidden_states.transpose(0, 1), - attn_mask=text_encoder_attention_mask, + key=text_encoder_hidden_states, + value=text_encoder_hidden_states, + key_padding_mask=text_encoder_attention_mask, average_attn_weights=False ) @@ -1590,12 +1590,12 @@ def __init__(self, config): def forward( self, vision_hidden_state: torch.FloatTensor, - text_hiddend_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, text_token_mask: torch.BoolTensor ) -> torch.FloatTensor: - output = vision_hidden_state @ text_hiddend_state.transpose(-1, -2) + output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) # padding to max_text_len @@ -1867,7 +1867,7 @@ def forward( text_position_embedding=text_position_embedding, text_self_attention_masks=text_self_attention_masks, text_position_ids=text_position_ids - ) + ) if output_attentions: @@ -2488,7 +2488,7 @@ def forward( topk_coords_logits = topk_coords_logits.detach() reference_points = topk_coords_logits.sigmoid() init_reference_points = reference_points - if query_embeds: + if query_embeds is not None: target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) else: target = torch.gather( @@ -2679,6 +2679,7 @@ def forward( ) hidden_states = outputs.intermediate_hidden_states if return_dict else 
outputs[2] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[9] init_reference = outputs.init_reference_points if return_dict else outputs[0] inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] @@ -2692,7 +2693,11 @@ def forward( else: reference = inter_references[:, level - 1] reference = inverse_sigmoid(reference) - outputs_class = self.class_embed[level](hidden_states[:, level]) + outputs_class = self.class_embed[level]( + vision_hidden_state=hidden_states[:, level], + text_hidden_state=enc_text_hidden_state, + text_token_mask=text_token_mask + ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: outputs_coord_logits = delta_bbox + reference From 8dbed3d4d7bf24ca28d47040b4d4187848cb6381 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 22:37:58 -0300 Subject: [PATCH 060/252] Fixed attentions to be passed correctly --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2e9d7d3d0de7f5..edbab3773a4fcd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2504,7 +2504,7 @@ def forward( vision_encoder_hidden_states=encoder_outputs[0], vision_encoder_attention_mask=mask_flatten, text_encoder_hidden_states=encoder_outputs[1], - text_encoder_attention_mask=text_token_mask, + text_encoder_attention_mask=~text_token_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, From a2af17210ceaf6d2ff9dcef558aa7748bad1274d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:46:17 -0300 Subject: [PATCH 061/252] Passing temperature arg when creating Sine position embedding --- .../models/grounding_dino/modeling_grounding_dino.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index edbab3773a4fcd..671092a234ee04 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -594,7 +594,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[in return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->GroundingDINO class GroundingDINOSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you @@ -619,8 +618,8 @@ def forward(self, pixel_values, pixel_mask): x_embed = pixel_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 - y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) @@ 
-662,7 +661,7 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding(n_steps, normalize=True) + position_embedding = GroundingDINOSinePositionEmbedding(n_steps, config.positional_embedding_temperature, normalize=True) elif config.position_embedding_type == "learned": position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) else: From 759fc1461d2c8adaa02043a5980de352e4317b9e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:47:09 -0300 Subject: [PATCH 062/252] Removed copy comments --- .../models/grounding_dino/modeling_grounding_dino.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 671092a234ee04..000c3e1f23ff1f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -656,7 +656,6 @@ def forward(self, pixel_values, pixel_mask=None): return pos -# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->GroundingDINO def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": From 51963733ae9ffc9a95def9b8751d0d103a8b457f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:48:03 -0300 Subject: [PATCH 063/252] Added temperature argument for position embedding --- .../models/grounding_dino/configuration_grounding_dino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3a62780362d834..e321782b197810 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -150,7 +150,8 @@ class GroundingDINOConfig(PretrainedConfig): Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. - + positional_embedding_temperature (`float`, *optional*, defaults to 20): + The temperature for Sine Positional Embedding that is used together with vision backbone. 
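The hunks above route `config.positional_embedding_temperature` into `GroundingDINOSinePositionEmbedding` and drop the `-0.5` offset when normalizing the cumulative coordinates. For intuition, a minimal standalone sketch of a temperature-parameterized 2D sine embedding is given below; it is independent of the classes in this patch, and the function name and defaults are illustrative only.

```python
# Minimal sketch (assumptions: channels-first pixel masks, default temperature 20
# as in the configuration entry above). Not the patch's class, just the same idea.
import math

import torch


def sine_position_embedding(pixel_mask, embedding_dim=128, temperature=20.0, scale=2 * math.pi):
    # pixel_mask: (batch, height, width), 1 for valid pixels and 0 for padding
    y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
    x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
    eps = 1e-6
    y_embed = y_embed / (y_embed[:, -1:, :] + eps) * scale
    x_embed = x_embed / (x_embed[:, :, -1:] + eps) * scale

    # geometric frequency ladder whose spread is controlled by the temperature
    dim_t = torch.arange(embedding_dim, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / embedding_dim)

    pos_x = x_embed[:, :, :, None] / dim_t
    pos_y = y_embed[:, :, :, None] / dim_t
    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=4).flatten(3)
    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=4).flatten(3)
    # returns (batch, 2 * embedding_dim, height, width)
    return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
```

A larger temperature stretches the wavelengths of the higher-frequency channels; 20 is simply the default value that the configuration change above introduces.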
Examples: ```python @@ -227,6 +228,7 @@ def __init__( decoder_bbox_embed_share = True, two_stage_bbox_embed_share = False, two_stage_class_embed_share = False, + positional_embedding_temperature = 20, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -302,6 +304,7 @@ def __init__( if two_stage_bbox_embed_share and not decoder_bbox_embed_share: raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") self.two_stage_class_embed_share = two_stage_class_embed_share + self.positional_embedding_temperature = positional_embedding_temperature super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property From 900cff443b26aa3bcf4bada6c4fb4263e5c1f116 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:48:36 -0300 Subject: [PATCH 064/252] Fixed typo when converting weigths to GroundingDINO vision backbone --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 15793a0df03ae7..3fe62356b8e7d9 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -84,7 +84,7 @@ def create_rename_keys(state_dict, config): f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", - f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention @@ -430,6 +430,8 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): position_ids=text_inputs["position_ids"], ) + print("Finished") + # if pytorch_dump_folder_path is not None: # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") # model.save_pretrained(pytorch_dump_folder_path) From f23a54aef775194ecac707b1cb29c0787760f01a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:31:38 -0300 Subject: [PATCH 065/252] Final modifications on modeling --- .../grounding_dino/modeling_grounding_dino.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 000c3e1f23ff1f..92ccdb41bab011 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1005,9 +1005,9 @@ def forward( ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output - residual = hidden_states - hidden_states = self.layer_norm_before(hidden_states) + + residual = hidden_states hidden_states = self.activation(self.fc1(hidden_states)) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = self.fc2(hidden_states) @@ -1426,7 +1426,7 @@ def forward( (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, - 
attention_mask=key_padding_mask, + attention_mask=~key_padding_mask, position_embeddings=vision_position_embedding, reference_points=reference_points, spatial_shapes=spatial_shapes, @@ -1517,9 +1517,10 @@ def forward( residual = hidden_states # Self Attention + q = k = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=self.with_pos_embed(hidden_states, position_embeddings), - key=self.with_pos_embed(hidden_states, position_embeddings), + query=q, + key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False @@ -1826,9 +1827,6 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - #TODO check if this is necessary according to original implementation - vision_features = nn.functional.dropout(vision_features, p=self.dropout, training=self.training) - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) encoder_vision_states = () if output_hidden_states else None From 3090b2c3d48edb7510004b579626be5f583f2bb1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:41:35 -0300 Subject: [PATCH 066/252] Removed unnecessary class --- .../grounding_dino/modeling_grounding_dino.py | 119 ------------------ 1 file changed, 119 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 92ccdb41bab011..94090841784322 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -841,125 +841,6 @@ def forward( return output, attention_weights - -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO -class GroundingDINOMultiheadAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. - - Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). - """ - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - bias: bool = True, - ): - super().__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - if self.head_dim * num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {num_heads})." 
- ) - self.scaling = self.head_dim**-0.5 - - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - - def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): - return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): - return tensor if position_embeddings is None else tensor + position_embeddings - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_embeddings: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, target_len, embed_dim = hidden_states.size() - # add position embeddings to the hidden states before projecting to queries and keys - if position_embeddings is not None: - hidden_states_original = hidden_states - hidden_states = self.with_pos_embed(hidden_states, position_embeddings) - - # get queries, keys and values - query_states = self.q_proj(hidden_states) * self.scaling - key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) - value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) - - proj_shape = (batch_size * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - source_len = key_states.size(1) - - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): - raise ValueError( - f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" - f" {attn_weights.size()}" - ) - - # expand attention_mask - if attention_mask is not None: - # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] - attention_mask = _expand_mask(attention_mask, hidden_states.dtype) - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, target_len, source_len): - raise ValueError( - f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" - f" {attention_mask.size()}" - ) - attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask - attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) - attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(batch_size, target_len, embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped - -# Repeting some code to avoid convert nn.MultiheadAttention later #TODO is this an approriate way to name this? class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" From 5c19e7548570e3c9cfffac850830b2b19a1406f3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:42:41 -0300 Subject: [PATCH 067/252] Fixed convert structure --- .../convert_grounding_dino_to_hf.py | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 3fe62356b8e7d9..5dcaad277092ca 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -388,7 +388,12 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return tokenized_for_encoder, tokenized.attention_mask.bool() @torch.no_grad() -def convert_grounding_dino_checkpoint(model_name, checkpoint_path): +def convert_grounding_dino_checkpoint( + model_name: str, + checkpoint_path: str, + pytorch_dump_folder_path: str = None, + push_to_hub: bool = False +): #Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) @@ -420,6 +425,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): image_inputs = image_processor(image) text_inputs, text_token_mask = text_processor(text, config) + # Running forward outputs = model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], @@ -430,19 +436,17 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): position_ids=text_inputs["position_ids"], ) - print("Finished") + if pytorch_dump_folder_path is not None: + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) - # if pytorch_dump_folder_path is not None: - # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - # model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) - # print(f"Saving image processor to {pytorch_dump_folder_path}") - # image_processor.save_pretrained(pytorch_dump_folder_path) - - # if push_to_hub: - # print(f"Pushing model and image processor for {model_name} to hub") - # 
model.push_to_hub(f"microsoft/{model_name}") - # image_processor.push_to_hub(f"microsoft/{model_name}") + if push_to_hub: + print(f"Pushing model and image processor for {model_name} to hub") + model.push_to_hub(f"microsoft/{model_name}") + image_processor.push_to_hub(f"microsoft/{model_name}") if __name__ == "__main__": @@ -469,4 +473,9 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) args = parser.parse_args() - convert_grounding_dino_checkpoint(args.model_name, args.checkpoint_path) \ No newline at end of file + convert_grounding_dino_checkpoint( + args.model_name, + args.checkpoint_path, + args.pytorch_dump_folder_path, + args.push_to_hub + ) \ No newline at end of file From aec2f682649398617bbec5bfcaeb2ef00356032b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 24 Sep 2023 01:35:07 -0300 Subject: [PATCH 068/252] Added image processing --- .../image_processing_grounding_dino.py | 967 ++++++++++++++++++ 1 file changed, 967 insertions(+) create mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py new file mode 100644 index 00000000000000..1adf8e8e0dcd62 --- /dev/null +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -0,0 +1,967 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Image processor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import BaseImageProcessor, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + make_list_of_images, + to_numpy_array, + valid_coco_detection_annotations, + valid_images, +) +from ...utils import ( + ExplicitEnum, + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + +if is_scipy_available(): + import scipy.special + import scipy.stats + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotionFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION) + + +# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (height <= width and height == size) or (width <= height and width == size): + return height, width + + if width < height: + ow = size + oh = int(size * height / width) + else: + oh = size + ow = int(size * width / height) + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. 
+ """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> List[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. 
+ """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + +def prepare_coco_detection_annotation( + image, + target, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by GroundingDINO. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints[keep] + + return new_target + +# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities +def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + probs = scipy.special.softmax(logits, axis=-1) + labels = probs.argmax(-1, keepdims=True) + scores = np.take_along_axis(probs, labels, axis=-1) + scores, labels = scores.squeeze(-1), labels.squeeze(-1) + return scores, labels + +# Copied from transformers.models.detr.image_processing_detr.resize_annotation +def resize_annotation( + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. 
+ """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +class GroundingDINOImageProcessor(BaseImageProcessor): + r""" + Constructs a Grounding DINO image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in + the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be + overridden by the `do_pad` parameter in the `preprocess` method. 
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_pad: bool = True, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + + @classmethod + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + def prepare_annotation( + self, + image: np.ndarray, + target: Dict, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into Grounding DINO model. + """ + target = prepare_coco_detection_annotation( + image, target, input_data_format=input_data_format + ) + + return target + + def prepare(self, image, target): + logger.warning_once( + "The `prepare` method is deprecated and will be removed in a v4.33. " + "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " + "does not return the image anymore.", + ) + target = self.prepare_annotation(image, target) + return image, target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary containing the size to resize to. 
Can contain the keys `shortest_edge` and `longest_edge` or + `height` and `width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> Dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format. + """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: Tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + return padded_image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: List[np.ndarray], + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + image (`np.ndarray`): + Image to pad. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
+ """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + padded_images = [ + self._pad_image( + image, + pad_size, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image after resizing. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. 
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + max_size = None + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, max_size=max_size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_pad = self.do_pad if do_pad is None else do_pad + + if do_resize is not None and size is None: + raise ValueError("Size and max_size must be specified if do_resize is True.") + + if do_rescale is not None and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize is not None and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + images = make_list_of_images(images) + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if not valid_coco_detection_annotations(annotations): + raise ValueError( + "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts" + "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " + "being a list of annotations in the COCO format." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
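[Editor's note] Putting the defaulting and validation above together, end-to-end usage of the image processor might look like the sketch below; the checkpoint path is a placeholder, not a published model id:

from PIL import Image
from transformers import AutoImageProcessor

image = Image.open("cats.jpg")  # any RGB test image
processor = AutoImageProcessor.from_pretrained("path/to/grounding-dino-checkpoint")  # placeholder
inputs = processor(images=image, return_tensors="pt")
# inputs contains "pixel_values" and, when padding is enabled, "pixel_mask"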
+ input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + if annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + data = self.pad( + images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + data = {"pixel_values": images} + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # POSTPROCESSING METHODS - TODO: add support for other frameworks + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`GroundingDINOForObjectDetection`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
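[Editor's note] The box handling in `post_process` relies on turning normalized `(center_x, center_y, width, height)` predictions into absolute corner coordinates; a minimal torch sketch of that conversion and scaling, written independently of the library helper:

import torch

def center_to_corners(boxes_cxcywh):
    # (center_x, center_y, width, height) -> (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
    cx, cy, w, h = boxes_cxcywh.unbind(-1)
    return torch.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], dim=-1)

boxes = torch.tensor([[0.5, 0.5, 0.2, 0.4]])          # relative [0, 1] coordinates
corners = center_to_corners(boxes)                     # tensor([[0.4, 0.3, 0.6, 0.7]])
height, width = 480, 640
scale = torch.tensor([width, height, width, height])   # (w, h, w, h), matching the code above
absolute = corners * scale                             # tensor([[256., 144., 384., 336.]])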
+ """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100 + ): + """ + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`GroundingDINOForObjectDetection`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + top_k (`int`, *optional*, defaults to 100): + Keep only top k bounding boxes before filtering by thresholding. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
+ """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = out_logits.sigmoid() + prob = prob.view(out_logits.shape[0], -1) + k_value = min(top_k, prob.size(1)) + topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results From b7a79cd1229d379ba546c67eaf086cbd5dfdc7c5 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 24 Sep 2023 01:37:59 -0300 Subject: [PATCH 069/252] make fixup partially completed --- docs/source/en/tasks/object_detection.md | 2 +- src/transformers/__init__.py | 32 +- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 6 +- .../models/auto/feature_extraction_auto.py | 1 - .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 +- .../configuration_grounding_dino.py | 35 +- .../convert_grounding_dino_to_hf.py | 163 +++---- .../grounding_dino/modeling_grounding_dino.py | 405 +++++++++--------- .../processing_grounding_dino.py | 0 .../tokenization_grounding_dino.py | 0 src/transformers/utils/dummy_pt_objects.py | 48 +-- utils/check_repo.py | 1 + 14 files changed, 347 insertions(+), 354 deletions(-) create mode 100644 src/transformers/models/grounding_dino/processing_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/tokenization_grounding_dino.py diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 8ed9da455bf7ba..58ec02e80cadf7 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [Grounding DINO](../model_doc/grounding-dino), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Grounding DINO](../model_doc/grounding-dino), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ff461296c5e76e..309ce05c8345e9 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -275,7 +275,6 @@ "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", 
"DebertaV2Config"], "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"], "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], "models.deprecated": [], "models.deprecated.bort": [], @@ -359,6 +358,7 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -1592,14 +1592,6 @@ "DeformableDetrPreTrainedModel", ] ) - _import_structure["models.grounding_dino"].extend( - [ - "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", - "GroundingDINOForObjectDetection", - "GroundingDINOModel", - "GroundingDINOPreTrainedModel", - ] - ) _import_structure["models.deit"].extend( [ "DEIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1972,6 +1964,14 @@ "GraphormerPreTrainedModel", ] ) + _import_structure["models.grounding_dino"].extend( + [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + ) _import_structure["models.groupvit"].extend( [ "GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4437,7 +4437,6 @@ DecisionTransformerConfig, ) from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.deprecated.mctct import ( MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -4513,6 +4512,7 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, @@ -5593,12 +5593,6 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) - from .models.grounding_dino import ( - GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, - GroundingDINOForObjectDetection, - GroundingDINOModel, - GroundingDINOPreTrainedModel, - ) from .models.deit import ( DEIT_PRETRAINED_MODEL_ARCHIVE_LIST, DeiTForImageClassification, @@ -5902,6 +5896,12 @@ GraphormerModel, GraphormerPreTrainedModel, ) + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) from .models.groupvit import ( GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST, GroupViTModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index cf718e4453f79d..ec035913f29398 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -61,7 +61,6 @@ deberta_v2, decision_transformer, deformable_detr, - grounding_dino, deit, deprecated, deta, @@ -100,6 +99,7 @@ gptj, gptsan_japanese, graphormer, + grounding_dino, groupvit, herbert, hubert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ca005bbc79df90..0b892f7f642642 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ 
b/src/transformers/models/auto/configuration_auto.py @@ -73,7 +73,6 @@ ("deberta-v2", "DebertaV2Config"), ("decision_transformer", "DecisionTransformerConfig"), ("deformable_detr", "DeformableDetrConfig"), - ("grounding-dino", "GroundingDINOConfig"), ("deit", "DeiTConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), @@ -109,6 +108,7 @@ ("gptj", "GPTJConfig"), ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), + ("grounding-dino", "GroundingDINOConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), @@ -288,7 +288,6 @@ ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deta", "DETA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -322,6 +321,7 @@ ("gptj", "GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gptsan-japanese", "GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -494,7 +494,6 @@ ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), ("deformable_detr", "Deformable DETR"), - ("grounding-dino", "Grounding DINO"), ("deit", "DeiT"), ("deplot", "DePlot"), ("deta", "DETA"), @@ -535,6 +534,7 @@ ("gptj", "GPT-J"), ("gptsan-japanese", "GPTSAN-japanese"), ("graphormer", "Graphormer"), + ("grounding-dino", "Grounding DINO"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), ("hubert", "Hubert"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 5bc4db87f7048b..befca6a64b81b7 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -50,7 +50,6 @@ ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("data2vec-vision", "BeitFeatureExtractor"), ("deformable_detr", "DeformableDetrFeatureExtractor"), - ("grounding-dino", "GroundingDINOFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("dinat", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index a791255829287d..6399fe192616af 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -53,7 +53,6 @@ ("cvt", "ConvNextImageProcessor"), ("data2vec-vision", "BeitImageProcessor"), ("deformable_detr", "DeformableDetrImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), ("deit", "DeiTImageProcessor"), ("deta", "DetaImageProcessor"), ("detr", "DetrImageProcessor"), @@ -67,6 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 842af5c5272abc..45669e3ad8b4ac 100755 --- 
a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -71,7 +71,6 @@ ("deberta-v2", "DebertaV2Model"), ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), - ("grounding-dino", "GroundingDINOModel"), ("deit", "DeiTModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), @@ -106,6 +105,7 @@ ("gptj", "GPTJModel"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), + ("grounding-dino", "GroundingDINOModel"), ("groupvit", "GroupViTModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), @@ -630,9 +630,9 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), - ("grounding-dino", "GroundingDINOForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), + ("grounding-dino", "GroundingDINOForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e321782b197810..09b9c41f131964 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -26,11 +26,10 @@ } - class GroundingDINOConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate - a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a + This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a + Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. @@ -147,9 +146,11 @@ class GroundingDINOConfig(PretrainedConfig): decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): Whether to share the bbox embedding between the decoder and the two-stage bbox generator. two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal + generation. two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. + Whether to share the class embedding between the two-stage bbox generator and the region proposal + generation. positional_embedding_temperature (`float`, *optional*, defaults to 20): The temperature for Sine Positional Embedding that is used together with vision backbone. 
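[Editor's note] As a quick illustration of the options documented above, the configuration can be instantiated with non-default text/fusion settings as sketched below (unspecified arguments keep their defaults; the values shown are arbitrary):

from transformers import GroundingDINOConfig

config = GroundingDINOConfig(
    max_text_len=256,                     # cap on the tokenized text length
    text_enhancer_dropout=0.0,            # dropout inside the text enhancer layers
    fusion_dropout=0.0,                   # dropout in the image/text fusion layers
    fusion_droppath=0.1,                  # drop-path rate applied in the fusion layers
    positional_embedding_temperature=20,  # temperature of the sine position embedding
)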
Examples: @@ -217,18 +218,18 @@ def __init__( eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, - #other parameters - max_text_len = 256, - sub_sentence_present = True, - text_enhancer_dropout = 0.0, - fusion_droppath = 0.1, - fusion_dropout = 0.0, - embedding_init_target = True, - query_dim = 4, - decoder_bbox_embed_share = True, - two_stage_bbox_embed_share = False, - two_stage_class_embed_share = False, - positional_embedding_temperature = 20, + # other parameters + max_text_len=256, + sub_sentence_present=True, + text_enhancer_dropout=0.0, + fusion_droppath=0.1, + fusion_dropout=0.0, + embedding_init_target=True, + query_dim=4, + decoder_bbox_embed_share=True, + two_stage_bbox_embed_share=False, + two_stage_class_embed_share=False, + positional_embedding_temperature=20, **kwargs, ): if backbone_config is not None and use_timm_backbone: diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 5dcaad277092ca..4f2f3716329ed4 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -14,7 +14,8 @@ # limitations under the License. """Convert GroundingDINO SimMIM checkpoints from the original repository. -URL: https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" +URL: +https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" import argparse @@ -22,11 +23,9 @@ import torch from PIL import Image from torchvision import transforms as T -import torchvision.transforms.functional as F -from transformers import ( - GroundingDINOConfig, GroundingDINOForObjectDetection, AutoTokenizer -) +from transformers import AutoTokenizer, GroundingDINOConfig, GroundingDINOForObjectDetection + IMAGENET_MEAN = [0.485, 0.456, 0.406] IMAGENET_STD = [0.229, 0.224, 0.225] @@ -66,64 +65,64 @@ def create_rename_keys(state_dict, config): #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer - rename_keys.append(("backbone.0.patch_embed.proj.weight", + rename_keys.append(("backbone.0.patch_embed.proj.weight", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.patch_embed.proj.bias", + rename_keys.append(("backbone.0.patch_embed.proj.bias", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.patch_embed.norm.weight", + rename_keys.append(("backbone.0.patch_embed.norm.weight", "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.patch_embed.norm.bias", + rename_keys.append(("backbone.0.patch_embed.norm.bias", "model.backbone.conv_encoder.model.embeddings.norm.bias")) for layer, depth in enumerate(config.backbone_config.depths): for block in range(depth): # layernorms - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) # intermidiate - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) - + # output - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) - + # downsample if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) - + for out_indice in config.backbone_config.out_indices: # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", + rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", + rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) - + ########################################## VISION BACKBONE - END ########################################## ENCODER - START @@ -182,15 +181,15 @@ def create_rename_keys(state_dict, config): for layer in range(config.encoder_layers): # deformable for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # text enhance for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # fusion layers for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) ########################################## ENCODER - END @@ -230,7 +229,7 @@ def create_rename_keys(state_dict, config): target_prefix_decoder = f'model.decoder.layers.{layer_num}.' 
for source_name, target_name in key_mappings_decoder.items(): - rename_keys.append((source_prefix_decoder + source_name, + rename_keys.append((source_prefix_decoder + source_name, target_prefix_decoder + target_name)) ########################################## DECODER - END @@ -240,7 +239,7 @@ def create_rename_keys(state_dict, config): ########################################## Additional - START for layer_name, params in state_dict.items(): - #### TEXT BACKBONE + #### TEXT BACKBONE if "bert" in layer_name: rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE @@ -251,19 +250,19 @@ def create_rename_keys(state_dict, config): rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) #### DECODER REFERENCE POINT HEAD if "transformer.decoder.ref_point_head" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", "model.decoder.reference_points_head"))) #### DECODER BBOX EMBED if "transformer.decoder.bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", "model.decoder.bbox_embed"))) if "transformer.enc_output" in layer_name: rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) - + if "transformer.enc_out_bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", + rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", "model.encoder_output_bbox_embed"))) - + rename_keys.append(("transformer.level_embed", "model.level_embed")) rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) @@ -273,10 +272,12 @@ def create_rename_keys(state_dict, config): # fmt: on return rename_keys + def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val + # we split up the matrix of each encoder layer into queries, keys and values def read_in_q_k_v(state_dict, config): ########################################## VISION BACKBONE - START @@ -288,14 +289,26 @@ def read_in_q_k_v(state_dict, config): in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] - - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight"] = in_proj_weight[-hidden_size :, :] - 
state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias"] = in_proj_bias[-hidden_size :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight" + ] = in_proj_weight[:hidden_size, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias" + ] = in_proj_bias[:hidden_size] + + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight" + ] = in_proj_weight[hidden_size : hidden_size * 2, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias" + ] = in_proj_bias[hidden_size : hidden_size * 2] + + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight" + ] = in_proj_weight[-hidden_size:, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias" + ] = in_proj_bias[-hidden_size:] ########################################## VISION BACKBONE - END @@ -305,12 +318,14 @@ def prepare_img(): image = Image.open(requests.get(url, stream=True).raw).convert("RGB") return image + def text_processor(text: str, config): def preprocess_caption(caption: str) -> str: result = caption.lower().strip() if result.endswith("."): return result return result + "." + def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: """Generate attention mask between each pair of special tokens Args: @@ -330,9 +345,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token idxs = torch.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = ( - torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - ) + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) position_ids = torch.zeros((bs, num_token), device=input_ids.device) cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 @@ -352,8 +365,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token previous_col = col cate_to_token_mask_list = [ - torch.stack(cate_to_token_mask_listi, dim=0) - for cate_to_token_mask_listi in cate_to_token_mask_list + torch.stack(cate_to_token_mask_listi, dim=0) for cate_to_token_mask_listi in cate_to_token_mask_list ] # # padding mask @@ -361,23 +373,23 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() return attention_mask, position_ids.to(torch.long) + tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens) - + tokenized, special_tokens + ) + max_text_len = config.max_text_len sub_sentence_present = config.sub_sentence_present if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[ - :, : max_text_len, : max_text_len - ] - position_ids = position_ids[:, : max_text_len] - tokenized["input_ids"] = 
tokenized["input_ids"][:, : max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, : max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : max_text_len] + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] # extract text embeddings if sub_sentence_present: @@ -387,14 +399,12 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return tokenized_for_encoder, tokenized.attention_mask.bool() + @torch.no_grad() def convert_grounding_dino_checkpoint( - model_name: str, - checkpoint_path: str, - pytorch_dump_folder_path: str = None, - push_to_hub: bool = False + model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str = None, push_to_hub: bool = False ): - #Define default GroundingDINO configuation + # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) # Load original checkpoint @@ -403,7 +413,7 @@ def convert_grounding_dino_checkpoint( # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) - + for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) @@ -416,17 +426,13 @@ def convert_grounding_dino_checkpoint( image = prepare_img() text = "a cat" image_processor = T.Compose( - [ - T.Resize(size=800, max_size=1333), - T.ToTensor(), - T.Normalize(IMAGENET_MEAN, IMAGENET_STD) - ] + [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] ) image_inputs = image_processor(image) text_inputs, text_token_mask = text_processor(text, config) # Running forward - outputs = model( + model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"], @@ -474,8 +480,5 @@ def convert_grounding_dino_checkpoint( args = parser.parse_args() convert_grounding_dino_checkpoint( - args.model_name, - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.push_to_hub - ) \ No newline at end of file + args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub + ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 94090841784322..69264d51b5e6b0 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -40,13 +40,11 @@ requires_backends, ) from ...modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPoolingAndCrossAttentions, - BaseModelOutputWithPastAndCrossAttentions + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...pytorch_utils import meshgrid +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone from .configuration_grounding_dino import GroundingDINOConfig @@ -135,7 +133,6 @@ def 
backward(context, grad_output): ] - @dataclass class GroundingDINODecoderOutput(ModelOutput): """ @@ -177,11 +174,11 @@ class GroundingDINODecoderOutput(ModelOutput): vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + @dataclass class GroundingDINOEncoderOutput(ModelOutput): """ - Base class for outputs of the GroundingDINOEncoder. This class extends - BaseModelOutput, due to: + Base class for outputs of the GroundingDINOEncoder. This class extends BaseModelOutput, due to: - vision and text last hidden states - vision and text intermediate hidden states - vision and text attentions @@ -193,30 +190,31 @@ class GroundingDINOEncoderOutput(ModelOutput): last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the text encoder. hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer - plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer - plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. 
+ Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. """ + last_hidden_state_vision: torch.FloatTensor = None last_hidden_state_text: torch.FloatTensor = None hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None @@ -262,29 +260,29 @@ class GroundingDINOModelOutput(ModelOutput): encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. 
@@ -359,29 +357,29 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. 
Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -618,8 +616,8 @@ def forward(self, pixel_values, pixel_mask): x_embed = pixel_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) @@ -660,7 +658,9 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding(n_steps, config.positional_embedding_temperature, normalize=True) + position_embedding = GroundingDINOSinePositionEmbedding( + n_steps, config.positional_embedding_temperature, normalize=True + ) elif config.position_embedding_type == "learned": position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) else: @@ -841,17 +841,19 @@ def forward( return output, attention_weights -#TODO is this an approriate way to name this? + +# TODO is this an approriate way to name this? 
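The hunks above touch `GroundingDINOSinePositionEmbedding.forward` and `build_position_encoding`; as a reading aid, here is a minimal standalone sketch of the DETR-style sine position embedding that forward pass computes. It is not code from the patch: the function name, the `embedding_dim=64` / `temperature=10000` values, and the toy mask are illustrative assumptions.

```python
import math

import torch


def sine_position_embedding(pixel_mask: torch.Tensor, embedding_dim: int = 64, temperature: int = 10000) -> torch.Tensor:
    # pixel_mask: (batch_size, height, width), 1 for valid pixels and 0 for padding
    scale = 2 * math.pi
    y_embed = pixel_mask.cumsum(1, dtype=torch.float32)  # running row index per column
    x_embed = pixel_mask.cumsum(2, dtype=torch.float32)  # running column index per row
    eps = 1e-6
    # normalize by the last (largest) cumulative value, as in the fix above
    y_embed = y_embed / (y_embed[:, -1:, :] + eps) * scale
    x_embed = x_embed / (x_embed[:, :, -1:] + eps) * scale

    dim_t = torch.arange(embedding_dim, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / embedding_dim)

    pos_x = x_embed[:, :, :, None] / dim_t
    pos_y = y_embed[:, :, :, None] / dim_t
    # interleave sine on even channels and cosine on odd channels
    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=4).flatten(3)
    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=4).flatten(3)
    return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)  # (batch, 2 * embedding_dim, height, width)


mask = torch.ones(1, 32, 32, dtype=torch.long)
print(sine_position_embedding(mask).shape)  # torch.Size([1, 128, 32, 32])
```

Each spatial location thus gets `2 * embedding_dim` channels: sine/cosine features of its normalized row coordinate concatenated with those of its column coordinate.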
class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" + def __init__(self, config): super().__init__() self.self_attn = nn.MultiheadAttention( - embed_dim=config.d_model, - num_heads=config.encoder_attention_heads // 2, + embed_dim=config.d_model, + num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout, batch_first=True, - ) + ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) @@ -871,18 +873,14 @@ def forward( hidden_states: Tensor, attention_masks: Optional[Tensor] = None, position_embeddings: Optional[Tensor] = None, - ): # repeat attn mask + ): # repeat attn mask if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) attention_output, attention_weights = self.self_attn( - query=q, - key=k, - value=hidden_states, - attn_mask=attention_masks, - average_attn_weights=False + query=q, key=k, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output @@ -897,16 +895,10 @@ def forward( hidden_states = self.layer_norm_after(hidden_states) return hidden_states, attention_weights - + + class GroundingDINOBiMultiHeadAttention(nn.Module): - def __init__( - self, - vision_dim: int, - text_dim: int, - embed_dim: int, - num_heads: int, - dropout:float = 0.1 - ): + def __init__(self, vision_dim: int, text_dim: int, embed_dim: int, num_heads: int, dropout: float = 0.1): super().__init__() self.embed_dim = embed_dim @@ -949,12 +941,12 @@ def _reset_parameters(self): self.out_text_proj.bias.data.fill_(0) def forward( - self, - vision_features: Tensor, - text_features: Tensor, - vision_attention_mask: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None - ): + self, + vision_features: Tensor, + text_features: Tensor, + vision_attention_mask: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + ): """_summary_ Args: @@ -1000,21 +992,21 @@ def forward( attn_weights = attn_weights - attn_weights.max() attn_weights = torch.clamp( - attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range attn_weights = torch.clamp( - attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range attn_weights_T = attn_weights.transpose(1, 2) text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] - + text_attn_weights = torch.clamp( - text_attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + text_attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range text_attn_weights = torch.clamp( - text_attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + text_attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range # mask vison for language if vision_attention_mask is not None: @@ -1027,9 +1019,7 @@ def forward( # mask language 
for vision if text_attention_mask is not None: - text_attention_mask = ( - text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) - ) + text_attention_mask = text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) attn_weights.masked_fill_(text_attention_mask, float("-inf")) vision_attn_weights = attn_weights.softmax(dim=-1) @@ -1062,6 +1052,7 @@ def forward( return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) + # Copied from transformers.models.beit.modeling_beit.drop_path def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: """ @@ -1082,6 +1073,7 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals output = input.div(keep_prob) * random_tensor return output + # Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO class GroundingDINODropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" @@ -1095,6 +1087,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) + + class GroundingDINOFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() @@ -1104,11 +1098,11 @@ def __init__(self, config, init_values=1e-4): self.layer_norm_vision = nn.LayerNorm(config.d_model) self.layer_norm_text = nn.LayerNorm(config.d_model) self.attn = GroundingDINOBiMultiHeadAttention( - vision_dim=config.d_model, - text_dim=config.d_model, - embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.encoder_attention_heads // 2, - dropout=config.fusion_dropout + vision_dim=config.d_model, + text_dim=config.d_model, + embed_dim=config.encoder_ffn_dim // 2, + num_heads=config.encoder_attention_heads // 2, + dropout=config.fusion_dropout, ) # add layer scale for training stability @@ -1120,17 +1114,18 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) (delta_v, vision_attn), (delta_t, text_attn) = self.attn( - vision_features, - text_features, - vision_attention_mask=attention_mask_vision, - text_attention_mask=attention_mask_text + vision_features, + text_features, + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text, ) vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) text_features = text_features + self.drop_path(self.gamma_l * delta_t) return (vision_features, vision_attn), (text_features, text_attn) -#NOTE just renamed the class + +# NOTE just renamed the class class GroundingDINODeformableLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() @@ -1210,12 +1205,13 @@ def forward( return hidden_states, attn_weights + def get_sine_pos_embed( pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000, exchange_xy: bool = True, - ) -> Tensor: +) -> Tensor: """generate sine position embedding from a position tensor Args: pos_tensor (torch.Tensor): shape: [..., n]. 
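Only the reformatted signature and the first docstring line of `get_sine_pos_embed` appear in this hunk, so the snippet below is a rough sketch of the idea rather than the patch's implementation: a position tensor of shape `[..., n]` is expanded into `num_pos_feats` sine/cosine features per coordinate, which is how the encoder layer later builds embeddings for the text positions it creates with `torch.arange`. The helper name here is an invention for illustration, and the `exchange_xy` behavior is deliberately omitted.

```python
import math

import torch


def sine_pos_embed_sketch(pos: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000) -> torch.Tensor:
    scale = 2 * math.pi
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)

    def embed(x: torch.Tensor) -> torch.Tensor:
        # x: [..., 1] -> [..., num_pos_feats], interleaving sin/cos over the frequency bands
        x = x * scale / dim_t
        return torch.stack((x[..., 0::2].sin(), x[..., 1::2].cos()), dim=-1).flatten(-2)

    # one block of features per coordinate in the last dimension, concatenated together
    return torch.cat([embed(pos[..., i, None]) for i in range(pos.shape[-1])], dim=-1)


# text positions as built in the encoder layer: (batch_size, num_text_tokens, 1)
text_positions = torch.arange(6).float().unsqueeze(0).unsqueeze(-1)
print(sine_pos_embed_sketch(text_positions, num_pos_feats=256).shape)  # torch.Size([1, 6, 256])
```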
@@ -1250,26 +1246,19 @@ def __init__(self, config) -> None: self.deformable_layer = GroundingDINODeformableLayer(config) def get_text_position_embeddings( - self, - text_features: Tensor, - text_position_embedding: Tensor, - text_position_ids: Tensor - ) -> Tensor: + self, text_features: Tensor, text_position_embedding: Tensor, text_position_ids: Tensor + ) -> Tensor: bs, n_text, text_dim = text_features.shape if text_position_embedding is None and text_position_ids is None: text_position_embedding = ( - torch.arange(n_text, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(bs, 1, 1) + torch.arange(n_text, device=text_features.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs, 1, 1) ) text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) if text_position_ids is not None: text_position_embedding = get_sine_pos_embed( text_position_ids[..., None], num_pos_feats=256, exchange_xy=False ) - + return text_position_embedding def forward( @@ -1284,12 +1273,10 @@ def forward( text_attention_mask: Optional[Tensor] = None, text_position_embedding: Optional[Tensor] = None, text_self_attention_masks: Optional[Tensor] = None, - text_position_ids: Optional[Tensor] = None + text_position_ids: Optional[Tensor] = None, ): text_position_embedding = self.get_text_position_embeddings( - text_features, - text_position_embedding, - text_position_ids + text_features, text_position_embedding, text_position_ids ) (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( @@ -1302,7 +1289,7 @@ def forward( (text_features, text_enhanced_attn) = self.text_enhancer_layer( hidden_states=text_features, attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=(text_position_embedding if text_position_embedding is not None else None) + position_embeddings=(text_position_embedding if text_position_embedding is not None else None), ) (vision_features, vision_deformable_attn) = self.deformable_layer( @@ -1315,8 +1302,8 @@ def forward( ) return ( - (vision_features, text_features), - (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn) + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn), ) @@ -1330,7 +1317,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True + batch_first=True, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -1342,7 +1329,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True + batch_first=True, ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -1400,11 +1387,7 @@ def forward( # Self Attention q = k = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=q, - key=k, - value=hidden_states, - attn_mask=self_attn_mask, - average_attn_weights=False + query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1419,7 +1402,7 @@ def forward( key=text_encoder_hidden_states, value=text_encoder_hidden_states, key_padding_mask=text_encoder_attention_mask, - 
average_attn_weights=False + average_attn_weights=False, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1462,19 +1445,18 @@ def forward( return outputs + class GroundingDINOContrastiveEmbedding(nn.Module): def __init__(self, config): super().__init__() self.max_text_len = config.max_text_len def forward( - self, - vision_hidden_state: torch.FloatTensor, - text_hidden_state: torch.FloatTensor, - text_token_mask: torch.BoolTensor - ) -> torch.FloatTensor: - - + self, + vision_hidden_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor, + ) -> torch.FloatTensor: output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) @@ -1484,6 +1466,7 @@ def forward( return new_output + # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead class GroundingDINOClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" @@ -1503,30 +1486,29 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->GroundingDINO class GroundingDINOPreTrainedModel(PreTrainedModel): config_class = GroundingDINOConfig base_model_prefix = "model" main_input_name = "pixel_values" def _init_weights(self, module): - std = self.config.init_std - if isinstance(module, GroundingDINOLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): module._reset_parameters() - elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() + elif isinstance(module, GroundingDINOBiMultiHeadAttention): + module._reset_parameters() + elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + elif isinstance(module, GroundingDINOModel): + nn.init.constant_(module.input_proj_text.bias.data, 0) + nn.init.xavier_uniform_(module.input_proj_text.weight.data) + for proj in module.input_proj_vision: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) if hasattr(module, "reference_points") and not self.config.two_stage: nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) nn.init.constant_(module.reference_points.bias.data, 0.0) @@ -1743,9 +1725,8 @@ def forward( text_attention_mask=text_attention_mask, text_position_embedding=text_position_embedding, text_self_attention_masks=text_self_attention_masks, - text_position_ids=text_position_ids - ) - + text_position_ids=text_position_ids, + ) if output_attentions: all_attn_fused_vision += (attentions[0],) @@ -1759,9 +1740,12 @@ def forward( if not return_dict: enc_outputs = [ - vision_features, text_features, - all_attn_fused_vision, all_attn_fused_text, - all_attn_enhanced_text, all_attn_deformable + vision_features, + text_features, + 
all_attn_fused_vision, + all_attn_fused_text, + all_attn_enhanced_text, + all_attn_deformable, ] return tuple(v for v in enc_outputs if v is not None) return GroundingDINOEncoderOutput( @@ -1772,9 +1756,10 @@ def forward( cross_attentions_vision=all_attn_fused_vision, cross_attentions_text=all_attn_fused_text, attentions_vision=all_attn_deformable, - attentions_text=all_attn_enhanced_text + attentions_text=all_attn_enhanced_text, ) + class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. @@ -1797,10 +1782,7 @@ def __init__(self, config: GroundingDINOConfig): self.layer_norm = nn.LayerNorm(config.d_model) self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.reference_points_head = GroundingDINOMLPPredictionHead( - config.query_dim // 2 * config.d_model, - config.d_model, - config.d_model, - 2 + config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2 ) self.gradient_checkpointing = False @@ -1826,7 +1808,7 @@ def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTen # batch_size, num_queries, num_pos_feats pos_x = pos_x[:, :, None] / dim_t pos_y = pos_y[:, :, None] / dim_t - # batch_size, num_queries, num_pos_feats + # batch_size, num_queries, num_pos_feats pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) @@ -1849,8 +1831,6 @@ def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTen raise ValueError("Unknown proposals shape(-1):{}".format(proposals.size(-1))) return pos - - def forward( self, inputs_embeds, @@ -1959,7 +1939,7 @@ def custom_forward(*inputs): text_encoder_hidden_states=text_encoder_hidden_states, text_encoder_attention_mask=text_encoder_attention_mask, self_attn_mask=self_attn_mask, - output_attentions=output_attentions + output_attentions=output_attentions, ) hidden_states = layer_outputs[0] @@ -1992,7 +1972,6 @@ def custom_forward(*inputs): if vision_encoder_hidden_states is not None: all_cross_attns_vision += (layer_outputs[3],) - # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) @@ -2012,7 +1991,7 @@ def custom_forward(*inputs): all_hidden_states, all_self_attns, all_cross_attns_vision, - all_cross_attns_text + all_cross_attns_text, ] if v is not None ) @@ -2023,7 +2002,7 @@ def custom_forward(*inputs): hidden_states=all_hidden_states, attentions=all_self_attns, vision_cross_attentions=all_cross_attns_vision, - text_cross_attentions=all_cross_attns_text + text_cross_attentions=all_cross_attns_text, ) @@ -2075,7 +2054,7 @@ def __init__(self, config: GroundingDINOConfig): ) # Create text backbone - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) + self.text_backbone = GroundingDINOTextPrenet(config.text_backbone_config) self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: @@ -2199,7 +2178,7 @@ def forward( text_token_mask: Tensor, text_self_attention_masks: Tensor, position_ids: Tensor, - pixel_mask: Optional[Tensor]=None, + pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, output_hidden_states=None, @@ -2236,7 +2215,9 @@ def forward( return_dict 
= return_dict if return_dict is not None else self.config.use_return_dict # Extract text features from text backbone - text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)["last_hidden_state"] + text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)[ + "last_hidden_state" + ] text_features = self.input_proj_text(text_features) batch_size, num_channels, height, width = pixel_values.shape @@ -2319,7 +2300,7 @@ def forward( text_position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict + return_dict=return_dict, ) # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): @@ -2346,9 +2327,7 @@ def forward( # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) enc_outputs_class = self.encoder_output_class_embed( - object_query_embedding, - encoder_outputs[1], - text_token_mask + object_query_embedding, encoder_outputs[1], text_token_mask ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) @@ -2389,7 +2368,7 @@ def forward( self_attn_mask=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict + return_dict=return_dict, ) if not return_dict: @@ -2422,8 +2401,8 @@ def forward( @add_start_docstrings( """ - Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on - top, for tasks such as COCO detection. + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, + for tasks such as COCO detection. 
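Several of the pieces above (the `GroundingDINOContrastiveEmbedding` class and the `encoder_output_class_embed` call in the two-stage branch) score object queries directly against the encoded text rather than against a fixed label set. A toy sketch of that mechanism, with made-up shapes and random tensors purely for illustration:

```python
import torch

batch_size, num_queries, hidden_size = 2, 4, 8
num_text_tokens, max_text_len = 5, 10

vision_hidden_state = torch.randn(batch_size, num_queries, hidden_size)    # decoder/query features
text_hidden_state = torch.randn(batch_size, num_text_tokens, hidden_size)  # encoded text tokens
text_token_mask = torch.tensor([[1, 1, 1, 1, 0],
                                [1, 1, 1, 0, 0]], dtype=torch.bool)         # real vs. padded text tokens

# similarity of every query to every text token
logits = vision_hidden_state @ text_hidden_state.transpose(-1, -2)          # (batch, queries, text_tokens)
logits = logits.masked_fill(~text_token_mask[:, None, :], float("-inf"))    # ignore padded text positions

# pad to a fixed width so every decoder layer produces logits of the same shape
padded_logits = torch.full((batch_size, num_queries, max_text_len), float("-inf"))
padded_logits[..., :num_text_tokens] = logits
print(padded_logits.shape)  # torch.Size([2, 4, 10])
```

Per-query scores are then derived from these text-token logits, which is what keeps the detection head open-vocabulary.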
""", GROUNDING_DINO_START_DOCSTRING, ) @@ -2446,13 +2425,12 @@ def __init__(self, config: GroundingDINOConfig): nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - if config.decoder_bbox_embed_share: self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) - # hack implementation for two-stage + # hack implementation for two-stage self.model.decoder.bbox_embed = self.bbox_embed self.model.decoder.class_embed = self.class_embed @@ -2461,8 +2439,8 @@ def __init__(self, config: GroundingDINOConfig): self.model.encoder_output_bbox_embed = _bbox_embed else: self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) - - #TODO don't believe this is necessary since class_embed has no parameters + + # TODO don't believe this is necessary since class_embed has no parameters if config.two_stage_class_embed_share: self.model.encoder_output_class_embed = _class_embed else: @@ -2490,12 +2468,12 @@ def forward( text_token_mask: torch.BoolTensor, text_self_attention_masks: torch.BoolTensor, position_ids: torch.LongTensor, - pixel_mask: Optional[torch.BoolTensor]=None, - encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]]=None, - labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, + pixel_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ): r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2541,14 +2519,14 @@ def forward( # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( - pixel_values=pixel_values , - input_ids=input_ids , - attention_mask=attention_mask , - token_type_ids=token_type_ids , - text_token_mask=text_token_mask , - text_self_attention_masks=text_self_attention_masks , - position_ids=position_ids , - pixel_mask=pixel_mask , + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + position_ids=position_ids, + pixel_mask=pixel_mask, encoder_outputs=encoder_outputs, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -2573,8 +2551,8 @@ def forward( outputs_class = self.class_embed[level]( vision_hidden_state=hidden_states[:, level], text_hidden_state=enc_text_hidden_state, - text_token_mask=text_token_mask - ) + text_token_mask=text_token_mask, + ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: outputs_coord_logits = delta_bbox + reference @@ -3117,6 +3095,7 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): raise ValueError("Only 3-dimensional tensors are supported") return NestedTensor(tensor, mask) + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText class GroundingDINOTextEmbeddings(nn.Module): """Construct the embeddings from word, position and 
token_type embeddings.""" @@ -3181,8 +3160,10 @@ def forward( embeddings = self.dropout(embeddings) return embeddings + # Classes for Text Backbone (It's just a BERT model) + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3317,6 +3298,7 @@ def forward( outputs = outputs + (past_key_value,) return outputs + # Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText class GroundingDINOTextSelfOutput(nn.Module): def __init__(self, config): @@ -3331,6 +3313,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText class GroundingDINOTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3380,6 +3363,7 @@ def forward( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + # Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText class GroundingDINOTextIntermediate(nn.Module): def __init__(self, config): @@ -3395,6 +3379,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText class GroundingDINOTextOutput(nn.Module): def __init__(self, config): @@ -3409,6 +3394,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText class GroundingDINOTextLayer(nn.Module): def __init__(self, config): @@ -3495,6 +3481,7 @@ def feed_forward_chunk(self, attention_output): layer_output = self.output(intermediate_output, attention_output) return layer_output + # Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText class GroundingDINOTextEncoder(nn.Module): def __init__(self, config): @@ -3593,6 +3580,7 @@ def custom_forward(*inputs): cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText class GroundingDINOTextPooler(nn.Module): def __init__(self, config): @@ -3608,7 +3596,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: pooled_output = self.activation(pooled_output) return pooled_output -class GroundingDINOTextModel(PreTrainedModel): + +class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/transformers/models/grounding_dino/tokenization_grounding_dino.py b/src/transformers/models/grounding_dino/tokenization_grounding_dino.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 22f24222f67514..21ce436a8c4935 100644 --- 
a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2486,30 +2486,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None - - -class GroundingDINOForObjectDetection(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class GroundingDINOModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class GroundingDINOPreTrainedModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -4005,6 +3981,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GroundingDINOForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/utils/check_repo.py b/utils/check_repo.py index 85cf36eeacb1b7..95ab142fa0b7f9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -223,6 +223,7 @@ "FlavaMultimodalModel", "GPT2DoubleHeadsModel", "GPTSw3DoubleHeadsModel", + "GroundingDINOTextPrenet", "InstructBlipVisionModel", "InstructBlipQFormerModel", "LayoutLMForQuestionAnswering", From 685f1d66b3656087515b97185efab8017f39d71c Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 13:45:13 -0300 Subject: [PATCH 070/252] Now text_backbone_config has its own class --- .../configuration_grounding_dino.py | 119 ++++++++++++++++-- 1 file changed, 111 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 09b9c41f131964..a3aa2b733d0474 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -25,6 +25,115 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } +# Copied from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet +class GroundingDINOTextPrenetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a + [`TFGroundingDINOTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. 
Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`] or [`TFGroundingDINOTextPrenetModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`] or + [`TFGroundingDINOTextPrenetModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python + >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOTextPrenetModel + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = GroundingDINOTextPrenetConfig() + + >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration + >>> model = GroundingDINOTextPrenetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "grounding-dino-text-prenet" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + class GroundingDINOConfig(PretrainedConfig): r""" @@ -177,7 +286,7 @@ def __init__( self, use_timm_backbone=False, backbone_config={"model_type": "swin"}, - text_backbone_config="bert-base-uncased", + text_backbone_config=None, num_channels=3, num_queries=900, max_position_embeddings=1024, @@ -187,15 +296,12 @@ def __init__( decoder_layers=6, decoder_ffn_dim=2048, decoder_attention_heads=8, - encoder_layerdrop=0.0, is_encoder_decoder=True, activation_function="relu", d_model=256, dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", @@ -259,9 +365,6 @@ def __init__( self.attention_dropout = attention_dropout self.activation_dropout = activation_dropout self.activation_function = activation_function - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.encoder_layerdrop = encoder_layerdrop self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type self.backbone = backbone @@ -289,7 +392,7 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) + self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config self.max_text_len = max_text_len self.sub_sentence_present = sub_sentence_present # Text Enhancer From d6e88fcf7d9f8c9cd009c85efc62e028b92e96fb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 13:47:56 -0300 Subject: [PATCH 071/252] Modified convert script --- .../convert_grounding_dino_to_hf.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git 
a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4f2f3716329ed4..29ad93f70ab536 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -374,7 +374,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return attention_mask, position_ids.to(torch.long) - tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") @@ -401,12 +401,21 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token @torch.no_grad() -def convert_grounding_dino_checkpoint( - model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str = None, push_to_hub: bool = False -): +def convert_grounding_dino_checkpoint(args): + + model_name = args.model_name + pytorch_dump_folder_path = args.pytorch_dump_folder_path + push_to_hub = args.push_to_hub + + checkpoint_mapping = { + "grounding-dino-tiny": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", + "grounding-dino-base": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + } # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) + checkpoint_path = checkpoint_mapping[model_name] + # Load original checkpoint original_state_dict = torch.load(checkpoint_path, map_location="cpu") @@ -432,7 +441,7 @@ def convert_grounding_dino_checkpoint( text_inputs, text_token_mask = text_processor(text, config) # Running forward - model( + output = model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"], @@ -451,8 +460,11 @@ def convert_grounding_dino_checkpoint( if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") - model.push_to_hub(f"microsoft/{model_name}") - image_processor.push_to_hub(f"microsoft/{model_name}") + model.push_to_hub(f"EduardoPacheco/{model_name}") + #TODO push image processor to hub + # image_processor.push_to_hub(f"microsoft/{model_name}") + #TODO push tokenizer to hub + #TODO push processor to hub if __name__ == "__main__": @@ -460,17 +472,17 @@ def convert_grounding_dino_checkpoint( # Required parameters parser.add_argument( "--model_name", - default="grounding-dino-tiny", + default="grounding-dino-base", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", ) - parser.add_argument( - "--checkpoint_path", - default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", - type=str, - help="Path to the original PyTorch checkpoint (.pth file).", - ) + # parser.add_argument( + # "--checkpoint_path", + # default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + # type=str, + # help="Path to the original PyTorch checkpoint (.pth file).", + # ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
) @@ -479,6 +491,4 @@ def convert_grounding_dino_checkpoint( ) args = parser.parse_args() - convert_grounding_dino_checkpoint( - args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub - ) + convert_grounding_dino_checkpoint(args) From 0242e57c848e684e5b4408ce8e20a02439241a0b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 15:01:44 -0300 Subject: [PATCH 072/252] Removed unnecessary config attribute --- .../configuration_grounding_dino.py | 2 -- .../convert_grounding_dino_to_hf.py | 21 ++++--------------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index a3aa2b733d0474..fbd0d483b48e45 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -326,7 +326,6 @@ def __init__( disable_custom_kernels=False, # other parameters max_text_len=256, - sub_sentence_present=True, text_enhancer_dropout=0.0, fusion_droppath=0.1, fusion_dropout=0.0, @@ -394,7 +393,6 @@ def __init__( # Text backbone self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config self.max_text_len = max_text_len - self.sub_sentence_present = sub_sentence_present # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout # Fusion diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 29ad93f70ab536..ed16da3f0c4617 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -347,7 +347,6 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token # generate attention mask and positional ids attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) position_ids = torch.zeros((bs, num_token), device=input_ids.device) - cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 for i in range(idxs.shape[0]): row, col = idxs[i] @@ -359,18 +358,8 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token position_ids[row, previous_col + 1 : col + 1] = torch.arange( 0, col - previous_col, device=input_ids.device ) - c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() - c2t_maski[previous_col + 1 : col] = True - cate_to_token_mask_list[row].append(c2t_maski) - previous_col = col - - cate_to_token_mask_list = [ - torch.stack(cate_to_token_mask_listi, dim=0) for cate_to_token_mask_listi in cate_to_token_mask_list - ] - # # padding mask - # padding_mask = tokenized['attention_mask'] - # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + previous_col = col return attention_mask, position_ids.to(torch.long) @@ -383,7 +372,6 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token ) max_text_len = config.max_text_len - sub_sentence_present = config.sub_sentence_present if text_self_attention_masks.shape[1] > max_text_len: text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] position_ids = position_ids[:, :max_text_len] @@ -392,10 +380,9 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token 
tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] # extract text embeddings - if sub_sentence_present: - tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} - tokenized_for_encoder["attention_mask"] = text_self_attention_masks - tokenized_for_encoder["position_ids"] = position_ids + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids return tokenized_for_encoder, tokenized.attention_mask.bool() From af06c85c5e471c60b705e7e2e48522f9763a67d9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:06:12 -0300 Subject: [PATCH 073/252] Added new function to generate sub sentence mask --- .../grounding_dino/modeling_grounding_dino.py | 76 +++++++++++++++---- 1 file changed, 61 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 69264d51b5e6b0..d75db4735ad30a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -47,7 +47,7 @@ from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone -from .configuration_grounding_dino import GroundingDINOConfig +from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextPrenetConfig from .load_custom import load_cuda_kernels @@ -1923,9 +1923,16 @@ def custom_forward(*inputs): layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, + query_pos, + reference_points_input, + spatial_shapes, + level_start_index, vision_encoder_hidden_states, vision_encoder_attention_mask, - None, + text_encoder_hidden_states, + text_encoder_attention_mask, + self_attn_mask, + None ) else: layer_outputs = decoder_layer( @@ -2005,6 +2012,42 @@ def custom_forward(*inputs): text_cross_attentions=all_cross_attns_text, ) +SPECIAL_TOKENS = [101, 102, 1012, 1029] +def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: + """Generate attention mask between each pair of special tokens and positional ids. + Args: + input_ids (torch.LongTensor): input ids. Shape: [bs, num_token] + Returns: + Tuple[torch.Tensor]: attention mask between each special tokens and position_ids + """ + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in SPECIAL_TOKENS: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + + previous_col = col + + return attention_mask, position_ids.to(torch.long) + @add_start_docstrings( """ @@ -2173,11 +2216,8 @@ def forward( self, pixel_values: Tensor, input_ids: Tensor, - attention_mask: Tensor, token_type_ids: Tensor, - text_token_mask: Tensor, - text_self_attention_masks: Tensor, - position_ids: Tensor, + attention_mask: Tensor, pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, @@ -2214,8 +2254,19 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + + max_text_len = self.config.max_text_len + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + input_ids = input_ids[:, :max_text_len] + token_type_ids = token_type_ids[:, :max_text_len] + text_token_mask = text_token_mask[:, :max_text_len] + # Extract text features from text backbone - text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)[ + text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ "last_hidden_state" ] text_features = self.input_proj_text(text_features) @@ -2463,11 +2514,8 @@ def forward( self, pixel_values: torch.FloatTensor, input_ids: torch.LongTensor, - attention_mask: torch.BoolTensor, + attention_mask: torch.LongTensor, token_type_ids: torch.LongTensor, - text_token_mask: torch.BoolTensor, - text_self_attention_masks: torch.BoolTensor, - position_ids: torch.LongTensor, pixel_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, @@ -2523,9 +2571,6 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, - text_token_mask=text_token_mask, - text_self_attention_masks=text_self_attention_masks, - position_ids=position_ids, pixel_mask=pixel_mask, encoder_outputs=encoder_outputs, output_attentions=output_attentions, @@ -2551,7 +2596,7 @@ def forward( outputs_class = self.class_embed[level]( vision_hidden_state=hidden_states[:, level], text_hidden_state=enc_text_hidden_state, - text_token_mask=text_token_mask, + text_token_mask=attention_mask.bool(), ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: 
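To make the newly added `generate_masks_with_special_tokens_and_transfer_map` concrete, here is a small hand-traced usage sketch. It assumes the helper can be imported from the modeling module; the ids 101/102/1012 are the `[CLS]`/`[SEP]`/`"."` entries of `SPECIAL_TOKENS`, while the remaining ids merely stand in for ordinary word-piece tokens of a prompt like `"a cat. a dog."` and are not real tokenizer output.

```python
import torch

from transformers.models.grounding_dino.modeling_grounding_dino import (
    generate_masks_with_special_tokens_and_transfer_map,
)

# [CLS]  a     cat   .     a     dog   .     [SEP]
input_ids = torch.tensor([[101, 1037, 4937, 1012, 1037, 3899, 1012, 102]])

attention_mask, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids)

print(position_ids)
# tensor([[0, 0, 1, 2, 0, 1, 2, 0]]) -- position ids restart after every special token
print(attention_mask[0].int())
# block-diagonal: the tokens of "a cat ." attend only among themselves, likewise "a dog .",
# while [CLS] and [SEP] attend only to themselves
```

This is what lets each sub-sentence (phrase) in the prompt be encoded independently by the text backbone while sharing a single forward pass.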
@@ -3609,6 +3654,7 @@ class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ + config_class = GroundingDINOTextPrenetConfig def __init__(self, config, add_pooling_layer=True): super().__init__(config) From 43c0ce572c21c3a14c78f7170cd211e4a880e493 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:26:14 -0300 Subject: [PATCH 074/252] Renamed parameters with gamma in the name as it's currently not allowed --- .../models/grounding_dino/modeling_grounding_dino.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d75db4735ad30a..71e7cb33fba0b9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1107,8 +1107,8 @@ def __init__(self, config, init_values=1e-4): # add layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.gamma_v = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) - self.gamma_l = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.vision_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.text_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) @@ -1119,8 +1119,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at vision_attention_mask=attention_mask_vision, text_attention_mask=attention_mask_text, ) - vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) - text_features = text_features + self.drop_path(self.gamma_l * delta_t) + vision_features = vision_features + self.drop_path(self.vision_param * delta_v) + text_features = text_features + self.drop_path(self.text_param * delta_t) return (vision_features, vision_attn), (text_features, text_attn) From 2bb7b70eaffcf4520013859a77fae3418985d18f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:27:04 -0300 Subject: [PATCH 075/252] Removed tokenization and image_processing scripts since we'll map from existing models --- .../image_processing_grounding_dino.py | 967 ------------------ .../tokenization_grounding_dino.py | 0 2 files changed, 967 deletions(-) delete mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py delete mode 100644 src/transformers/models/grounding_dino/tokenization_grounding_dino.py diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py deleted file mode 100644 index 1adf8e8e0dcd62..00000000000000 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ /dev/null @@ -1,967 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Image processor class for Deformable DETR.""" - -import io -import pathlib -from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union - -import numpy as np - -from ...feature_extraction_utils import BatchFeature -from ...image_processing_utils import BaseImageProcessor, get_size_dict -from ...image_transforms import ( - PaddingMode, - center_to_corners_format, - corners_to_center_format, - id_to_rgb, - pad, - rescale, - resize, - rgb_to_id, - to_channel_dimension_format, -) -from ...image_utils import ( - IMAGENET_DEFAULT_MEAN, - IMAGENET_DEFAULT_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_coco_detection_annotations, - valid_images, -) -from ...utils import ( - ExplicitEnum, - TensorType, - is_flax_available, - is_jax_tensor, - is_scipy_available, - is_tf_available, - is_tf_tensor, - is_torch_available, - is_torch_tensor, - is_vision_available, - logging, -) - - -if is_torch_available(): - import torch - from torch import nn - - -if is_vision_available(): - import PIL - -if is_scipy_available(): - import scipy.special - import scipy.stats - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotionFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - - -SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION) - - -# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio -def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: - """ - Computes the output image size given the input image size and the desired output size. - - Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): - The desired output size. - max_size (`int`, *optional*): - The maximum allowed output size. - """ - height, width = image_size - if max_size is not None: - min_original_size = float(min((height, width))) - max_original_size = float(max((height, width))) - if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) - - if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: - ow = size - oh = int(size * height / width) - else: - oh = size - ow = int(size * width / height) - return (oh, ow) - - -# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size -def get_resize_output_image_size( - input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], - max_size: Optional[int] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, -) -> Tuple[int, int]: - """ - Computes the output image size given the input image size and the desired output size. If the desired output size - is a tuple or list, the output image size is returned as is. 
If the desired output size is an integer, the output - image size is computed by keeping the aspect ratio of the input image size. - - Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): - The desired output size. - max_size (`int`, *optional*): - The maximum allowed output size. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred from the input image. - """ - image_size = get_image_size(input_image, input_data_format) - if isinstance(size, (list, tuple)): - return size - - return get_size_with_aspect_ratio(image_size, size, max_size) - - -# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn -def get_numpy_to_framework_fn(arr) -> Callable: - """ - Returns a function that converts a numpy array to the framework of the input array. - - Args: - arr (`np.ndarray`): The array to convert. - """ - if isinstance(arr, np.ndarray): - return np.array - if is_tf_available() and is_tf_tensor(arr): - import tensorflow as tf - - return tf.convert_to_tensor - if is_torch_available() and is_torch_tensor(arr): - import torch - - return torch.tensor - if is_flax_available() and is_jax_tensor(arr): - import jax.numpy as jnp - - return jnp.array - raise ValueError(f"Cannot convert arrays of type {type(arr)}") - - -# Copied from transformers.models.detr.image_processing_detr.safe_squeeze -def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: - """ - Squeezes an array, but only if the axis specified has dim 1. - """ - if axis is None: - return arr.squeeze() - - try: - return arr.squeeze(axis=axis) - except ValueError: - return arr - - -# Copied from transformers.models.detr.image_processing_detr.normalize_annotation -def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: - image_height, image_width = image_size - norm_annotation = {} - for key, value in annotation.items(): - if key == "boxes": - boxes = value - boxes = corners_to_center_format(boxes) - boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) - norm_annotation[key] = boxes - else: - norm_annotation[key] = value - return norm_annotation - - -# Copied from transformers.models.detr.image_processing_detr.max_across_indices -def max_across_indices(values: Iterable[Any]) -> List[Any]: - """ - Return the maximum value across all indices of an iterable of values. - """ - return [max(values_i) for values_i in zip(*values)] - - -# Copied from transformers.models.detr.image_processing_detr.get_max_height_width -def get_max_height_width( - images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None -) -> List[int]: - """ - Get the maximum height and width across all images in a batch. 
- """ - if input_data_format is None: - input_data_format = infer_channel_dimension_format(images[0]) - - if input_data_format == ChannelDimension.FIRST: - _, max_height, max_width = max_across_indices([img.shape for img in images]) - elif input_data_format == ChannelDimension.LAST: - max_height, max_width, _ = max_across_indices([img.shape for img in images]) - else: - raise ValueError(f"Invalid channel dimension format: {input_data_format}") - return (max_height, max_width) - - -# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask -def make_pixel_mask( - image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None -) -> np.ndarray: - """ - Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. - - Args: - image (`np.ndarray`): - Image to make the pixel mask for. - output_size (`Tuple[int, int]`): - Output size of the mask. - """ - input_height, input_width = get_image_size(image, channel_dim=input_data_format) - mask = np.zeros(output_size, dtype=np.int64) - mask[:input_height, :input_width] = 1 - return mask - -def prepare_coco_detection_annotation( - image, - target, - input_data_format: Optional[Union[ChannelDimension, str]] = None, -): - """ - Convert the target in COCO format into the format expected by GroundingDINO. - """ - image_height, image_width = get_image_size(image, channel_dim=input_data_format) - - image_id = target["image_id"] - image_id = np.asarray([image_id], dtype=np.int64) - - # Get all COCO annotations for the given image. - annotations = target["annotations"] - annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] - - classes = [obj["category_id"] for obj in annotations] - classes = np.asarray(classes, dtype=np.int64) - - # for conversion to coco api - area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) - iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) - - boxes = [obj["bbox"] for obj in annotations] - # guard against no boxes via resizing - boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) - boxes[:, 2:] += boxes[:, :2] - boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) - boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) - - keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) - - new_target = {} - new_target["image_id"] = image_id - new_target["class_labels"] = classes[keep] - new_target["boxes"] = boxes[keep] - new_target["area"] = area[keep] - new_target["iscrowd"] = iscrowd[keep] - new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) - - if annotations and "keypoints" in annotations[0]: - keypoints = [obj["keypoints"] for obj in annotations] - keypoints = np.asarray(keypoints, dtype=np.float32) - num_keypoints = keypoints.shape[0] - keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints - new_target["keypoints"] = keypoints[keep] - - return new_target - -# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities -def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - probs = scipy.special.softmax(logits, axis=-1) - labels = probs.argmax(-1, keepdims=True) - scores = np.take_along_axis(probs, labels, axis=-1) - scores, labels = scores.squeeze(-1), labels.squeeze(-1) - return scores, labels - -# Copied from 
transformers.models.detr.image_processing_detr.resize_annotation -def resize_annotation( - annotation: Dict[str, Any], - orig_size: Tuple[int, int], - target_size: Tuple[int, int], - threshold: float = 0.5, - resample: PILImageResampling = PILImageResampling.NEAREST, -): - """ - Resizes an annotation to a target size. - - Args: - annotation (`Dict[str, Any]`): - The annotation dictionary. - orig_size (`Tuple[int, int]`): - The original size of the input image. - target_size (`Tuple[int, int]`): - The target size of the image, as returned by the preprocessing `resize` step. - threshold (`float`, *optional*, defaults to 0.5): - The threshold used to binarize the segmentation masks. - resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): - The resampling filter to use when resizing the masks. - """ - ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) - ratio_height, ratio_width = ratios - - new_annotation = {} - new_annotation["size"] = target_size - - for key, value in annotation.items(): - if key == "boxes": - boxes = value - scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) - new_annotation["boxes"] = scaled_boxes - elif key == "area": - area = value - scaled_area = area * (ratio_width * ratio_height) - new_annotation["area"] = scaled_area - elif key == "masks": - masks = value[:, None] - masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) - masks = masks.astype(np.float32) - masks = masks[:, 0] > threshold - new_annotation["masks"] = masks - elif key == "size": - new_annotation["size"] = target_size - else: - new_annotation[key] = value - - return new_annotation - - -class GroundingDINOImageProcessor(BaseImageProcessor): - r""" - Constructs a Grounding DINO image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be - overridden by the `do_resize` parameter in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Resampling filter to use if resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the - `do_rescale` parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - do_normalize: - Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the - `preprocess` method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): - Mean values to use when normalizing the image. Can be a single value or a list of values, one for each - channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): - Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one - for each channel. 
Can be overridden by the `image_std` parameter in the `preprocess` method. - do_pad (`bool`, *optional*, defaults to `True`): - Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be - overridden by the `do_pad` parameter in the `preprocess` method. - """ - - model_input_names = ["pixel_values", "pixel_mask"] - - def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BILINEAR, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Union[float, List[float]] = None, - image_std: Union[float, List[float]] = None, - do_pad: bool = True, - **kwargs, - ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} - size = get_size_dict(size, max_size=max_size, default_to_square=False) - - super().__init__(**kwargs) - self.format = format - self.do_resize = do_resize - self.size = size - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad - - @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO - def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - - def prepare_annotation( - self, - image: np.ndarray, - target: Dict, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> Dict: - """ - Prepare an annotation for feeding into Grounding DINO model. - """ - target = prepare_coco_detection_annotation( - image, target, input_data_format=input_data_format - ) - - return target - - def prepare(self, image, target): - logger.warning_once( - "The `prepare` method is deprecated and will be removed in a v4.33. " - "Please use `prepare_annotation` instead. 
Note: the `prepare_annotation` method " - "does not return the image anymore.", - ) - target = self.prepare_annotation(image, target) - return image, target - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize - def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BILINEAR, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> np.ndarray: - """ - Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an - int, smaller edge of the image will be matched to this number. - - Args: - image (`np.ndarray`): - Image to resize. - size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Resampling filter to use if resizing the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) - if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( - image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format - ) - elif "height" in size and "width" in size: - size = (size["height"], size["width"]) - else: - raise ValueError( - "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" - f" {size.keys()}." - ) - image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs - ) - return image - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation - def resize_annotation( - self, - annotation, - orig_size, - size, - resample: PILImageResampling = PILImageResampling.NEAREST, - ) -> Dict: - """ - Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched - to this number. - """ - return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale - def rescale( - self, - image: np.ndarray, - rescale_factor: float, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """ - Rescale the image by the given factor. image = image * rescale_factor. - - Args: - image (`np.ndarray`): - Image to rescale. - rescale_factor (`float`): - The value to use for rescaling. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. 
Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the input image. If unset, is inferred from the input image. Can be - one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - """ - return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation - def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: - """ - Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to - `[center_x, center_y, width, height]` format. - """ - return normalize_annotation(annotation, image_size=image_size) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image - def _pad_image( - self, - image: np.ndarray, - output_size: Tuple[int, int], - constant_values: Union[float, Iterable[float]] = 0, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """ - Pad an image with zeros to the given size. - """ - input_height, input_width = get_image_size(image, channel_dim=input_data_format) - output_height, output_width = output_size - - pad_bottom = output_height - input_height - pad_right = output_width - input_width - padding = ((0, pad_bottom), (0, pad_right)) - padded_image = pad( - image, - padding, - mode=PaddingMode.CONSTANT, - constant_values=constant_values, - data_format=data_format, - input_data_format=input_data_format, - ) - return padded_image - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad - def pad( - self, - images: List[np.ndarray], - constant_values: Union[float, Iterable[float]] = 0, - return_pixel_mask: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> BatchFeature: - """ - Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width - in the batch and optionally returns their corresponding pixel mask. - - Args: - image (`np.ndarray`): - Image to pad. - constant_values (`float` or `Iterable[float]`, *optional*): - The value to use for the padding if `mode` is `"constant"`. - return_pixel_mask (`bool`, *optional*, defaults to `True`): - Whether to return a pixel mask. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. 
- input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - pad_size = get_max_height_width(images, input_data_format=input_data_format) - - padded_images = [ - self._pad_image( - image, - pad_size, - constant_values=constant_values, - data_format=data_format, - input_data_format=input_data_format, - ) - for image in images - ] - data = {"pixel_values": padded_images} - - if return_pixel_mask: - masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) - for image in images - ] - data["pixel_mask"] = masks - - return BatchFeature(data=data, tensor_type=return_tensors) - - def preprocess( - self, - images: ImageInput, - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - resample=None, # PILImageResampling - do_rescale: Optional[bool] = None, - rescale_factor: Optional[Union[int, float]] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_pad: Optional[bool] = None, - return_tensors: Optional[Union[TensorType, str]] = None, - data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> BatchFeature: - """ - Preprocess an image or a batch of images so that it can be used by the model. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. - annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - do_resize (`bool`, *optional*, defaults to self.do_resize): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. - resample (`PILImageResampling`, *optional*, defaults to self.resample): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to self.do_rescale): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to self.rescale_factor): - Rescale factor to use when rescaling the image. - do_normalize (`bool`, *optional*, defaults to self.do_normalize): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): - Mean to use when normalizing the image. - image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): - Standard deviation to use when normalizing the image. - do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. 
- return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): - Type of tensors to return. If `None`, will return the list of images. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - if "pad_and_return_pixel_mask" in kwargs: - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - max_size = None - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - size = kwargs.pop("max_size") - - do_resize = self.do_resize if do_resize is None else do_resize - size = self.size if size is None else size - size = get_size_dict(size=size, max_size=max_size, default_to_square=False) - resample = self.resample if resample is None else resample - do_rescale = self.do_rescale if do_rescale is None else do_rescale - rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor - do_normalize = self.do_normalize if do_normalize is None else do_normalize - image_mean = self.image_mean if image_mean is None else image_mean - image_std = self.image_std if image_std is None else image_std - do_pad = self.do_pad if do_pad is None else do_pad - - if do_resize is not None and size is None: - raise ValueError("Size and max_size must be specified if do_resize is True.") - - if do_rescale is not None and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") - - if do_normalize is not None and (image_mean is None or image_std is None): - raise ValueError("Image mean and std must be specified if do_normalize is True.") - - images = make_list_of_images(images) - if annotations is not None and isinstance(annotations, dict): - annotations = [annotations] - - if annotations is not None and len(images) != len(annotations): - raise ValueError( - f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." - ) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - if not valid_coco_detection_annotations(annotations): - raise ValueError( - "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts" - "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " - "being a list of annotations in the COCO format." 
- ) - - # All transformations expect numpy arrays - images = [to_numpy_array(image) for image in images] - - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) - if annotations is not None: - prepared_images = [] - prepared_annotations = [] - for image, target in zip(images, annotations): - target = self.prepare_annotation( - image, - target, - input_data_format=input_data_format, - ) - prepared_images.append(image) - prepared_annotations.append(target) - images = prepared_images - annotations = prepared_annotations - del prepared_images, prepared_annotations - - # transformations - if do_resize: - if annotations is not None: - resized_images, resized_annotations = [], [] - for image, target in zip(images, annotations): - orig_size = get_image_size(image, input_data_format) - resized_image = self.resize( - image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format - ) - resized_annotation = self.resize_annotation( - target, orig_size, get_image_size(resized_image, input_data_format) - ) - resized_images.append(resized_image) - resized_annotations.append(resized_annotation) - images = resized_images - annotations = resized_annotations - del resized_images, resized_annotations - else: - images = [ - self.resize(image, size=size, resample=resample, input_data_format=input_data_format) - for image in images - ] - - if do_rescale: - images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] - - if do_normalize: - images = [ - self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images - ] - if annotations is not None: - annotations = [ - self.normalize_annotation(annotation, get_image_size(image, input_data_format)) - for annotation, image in zip(annotations, images) - ] - - if do_pad: - # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} - data = self.pad( - images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format - ) - else: - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - for image in images - ] - data = {"pixel_values": images} - - encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) - if annotations is not None: - encoded_inputs["labels"] = [ - BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations - ] - - return encoded_inputs - - # POSTPROCESSING METHODS - TODO: add support for other frameworks - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`GroundingDINOForObjectDetection`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. - Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - - def post_process_object_detection( - self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100 - ): - """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`GroundingDINOForObjectDetection`]): - Raw outputs of the model. - threshold (`float`, *optional*): - Score threshold to keep object detection predictions. - target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. - top_k (`int`, *optional*, defaults to 100): - Keep only top k bounding boxes before filtering by thresholding. - - Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if target_sizes is not None: - if len(out_logits) != len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits" - ) - - prob = out_logits.sigmoid() - prob = prob.view(out_logits.shape[0], -1) - k_value = min(top_k, prob.size(1)) - topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - if isinstance(target_sizes, List): - img_h = torch.Tensor([i[0] for i in target_sizes]) - img_w = torch.Tensor([i[1] for i in target_sizes]) - else: - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [] - for s, l, b in zip(scores, labels, boxes): - score = s[s > threshold] - label = l[s > threshold] - box = b[s > threshold] - results.append({"scores": score, "labels": label, "boxes": box}) - - return results diff --git a/src/transformers/models/grounding_dino/tokenization_grounding_dino.py b/src/transformers/models/grounding_dino/tokenization_grounding_dino.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 From 98f38406a20244bc9179cea42357b3e54227c1e1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:28:13 -0300 Subject: [PATCH 076/252] Fixed some issues with configuration --- .../configuration_grounding_dino.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index fbd0d483b48e45..e900714852fbaa 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Grounding DINO model configuration""" +import os +from typing import Union from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -25,7 +27,7 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } -# Copied from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet +# Modified from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet class GroundingDINOTextPrenetConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a @@ -134,6 +136,24 @@ def __init__( self.use_cache = use_cache self.classifier_dropout = classifier_dropout + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPSegConfig + if config_dict.get("model_type") == "grounding-dino": + config_dict = config_dict["text_backbone_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + class GroundingDINOConfig(PretrainedConfig): r""" @@ -289,7 +309,6 @@ def __init__( text_backbone_config=None, num_channels=3, num_queries=900, - max_position_embeddings=1024, encoder_layers=6, encoder_ffn_dim=2048, encoder_attention_heads=8, @@ -352,7 +371,6 @@ def __init__( self.backbone_config = backbone_config self.num_channels = num_channels self.num_queries = num_queries - self.max_position_embeddings = max_position_embeddings self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim self.encoder_layers = encoder_layers @@ -391,7 +409,7 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config + self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else GroundingDINOTextPrenetConfig(**text_backbone_config) self.max_text_len = max_text_len # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout From 703eeff584416cc71a8592a3146c21d78acd11e4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:32:22 -0300 Subject: [PATCH 077/252] Just some modifications on conversion script --- .../convert_grounding_dino_to_hf.py | 89 ++++--------------- 1 file changed, 18 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index ed16da3f0c4617..680c3872bf68dc 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -159,8 +159,8 @@ def create_rename_keys(state_dict, config): 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', } fusion_key_mappings = { - 'gamma_v': 'fusion_layer.gamma_v', - 'gamma_l': 'fusion_layer.gamma_l', + 'gamma_v': 
'fusion_layer.vision_param', + 'gamma_l': 'fusion_layer.text_param', 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', @@ -326,66 +326,11 @@ def preprocess_caption(caption: str) -> str: return result return result + "." - def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: - """Generate attention mask between each pair of special tokens - Args: - input_ids (torch.Tensor): input ids. Shape: [bs, num_token] - special_tokens_mask (list): special tokens mask. - Returns: - torch.Tensor: attention mask between each special tokens. - """ - input_ids = tokenized["input_ids"] - bs, num_token = input_ids.shape - # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens - special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() - for special_token in special_tokens_list: - special_tokens_mask |= input_ids == special_token - - # idxs: each row is a list of indices of special tokens - idxs = torch.nonzero(special_tokens_mask) - - # generate attention mask and positional ids - attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - position_ids = torch.zeros((bs, num_token), device=input_ids.device) - previous_col = 0 - for i in range(idxs.shape[0]): - row, col = idxs[i] - if (col == 0) or (col == num_token - 1): - attention_mask[row, col, col] = True - position_ids[row, col] = 0 - else: - attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True - position_ids[row, previous_col + 1 : col + 1] = torch.arange( - 0, col - previous_col, device=input_ids.device - ) - - previous_col = col - - return attention_mask, position_ids.to(torch.long) - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer - special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") - text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens - ) - - max_text_len = config.max_text_len - if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] - position_ids = position_ids[:, :max_text_len] - tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] - - # extract text embeddings - tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} - tokenized_for_encoder["attention_mask"] = text_self_attention_masks - tokenized_for_encoder["position_ids"] = position_ids - - return tokenized_for_encoder, tokenized.attention_mask.bool() + return tokenized @torch.no_grad() def convert_grounding_dino_checkpoint(args): @@ -415,7 +360,8 @@ def convert_grounding_dino_checkpoint(args): read_in_q_k_v(new_state_dict, config) # Load HF implementation with default config and converted state dict - model = GroundingDINOForObjectDetection(config).eval() + model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").eval() + # model = GroundingDINOForObjectDetection(config=config).eval() 
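[Editor's illustration, not part of the patch] The gamma_v/gamma_l to vision_param/text_param key mapping in the hunk above mirrors the parameter rename from PATCH 074: when loading checkpoints, Transformers has historically rewritten state-dict keys containing "gamma" and "beta" (a leftover from old TensorFlow LayerNorm naming), so parameters whose names contain "gamma" would never match their saved keys. A simplified sketch of that effect; the real rewrite lives inside PreTrainedModel's loading code and this helper only approximates it:

def fix_key(key: str) -> str:
    # Simplified stand-in for the legacy rewrite applied to checkpoint keys:
    # "gamma" -> "weight", "beta" -> "bias".
    return key.replace("gamma", "weight").replace("beta", "bias")

# A parameter registered as `fusion_layer.gamma_v` would be looked up under a
# rewritten key and silently fail to load:
print(fix_key("fusion_layer.gamma_v"))       # fusion_layer.weight_v
# The renamed parameter keeps a stable key:
print(fix_key("fusion_layer.vision_param"))  # fusion_layer.vision_param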
model.load_state_dict(new_state_dict, strict=False) # Load and process test image @@ -425,19 +371,24 @@ def convert_grounding_dino_checkpoint(args): [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] ) image_inputs = image_processor(image) - text_inputs, text_token_mask = text_processor(text, config) + text_inputs = text_processor(text, config) # Running forward output = model( pixel_values=image_inputs.unsqueeze(0), - input_ids=text_inputs["input_ids"], - attention_mask=text_inputs["attention_mask"], - token_type_ids=text_inputs["token_type_ids"], - text_token_mask=text_token_mask, - text_self_attention_masks=text_inputs["attention_mask"], - position_ids=text_inputs["position_ids"], + **text_inputs ) + # output.pred_boxes[:, :3, :] + # tensor([[[0.7674, 0.4136, 0.4572, 0.7305], + # [0.2566, 0.5463, 0.4760, 0.8777], + # [0.2585, 0.5442, 0.4640, 0.8683]]]) + # + # output.logits[:, :3, :4] + # tensor([[[-4.8913, -0.1900, -0.2161, -4.2374], + # [-4.9652, -0.3719, -0.3950, -4.2315], + # [-5.9599, -3.3765, -3.3104, -5.9752]]]) + if pytorch_dump_folder_path is not None: print(f"Saving model {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) @@ -448,10 +399,6 @@ def convert_grounding_dino_checkpoint(args): if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") model.push_to_hub(f"EduardoPacheco/{model_name}") - #TODO push image processor to hub - # image_processor.push_to_hub(f"microsoft/{model_name}") - #TODO push tokenizer to hub - #TODO push processor to hub if __name__ == "__main__": @@ -459,7 +406,7 @@ def convert_grounding_dino_checkpoint(args): # Required parameters parser.add_argument( "--model_name", - default="grounding-dino-base", + default="grounding-dino-tiny", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", From c1c1467a80ea9bbbf689c5cc55221da6f0bdb51a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:33:52 -0300 Subject: [PATCH 078/252] Other modifications --- src/transformers/__init__.py | 4 ++-- src/transformers/models/grounding_dino/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 309ce05c8345e9..6ceff48c7c5cbc 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -358,7 +358,7 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -4512,7 +4512,7 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index e3767e017d1023..df2b0d907f1b65 100644 --- 
a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -18,7 +18,7 @@ _import_structure = { - "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], + "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], } try: @@ -36,7 +36,7 @@ if TYPE_CHECKING: - from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig try: if not is_torch_available(): From bfb8829f1f7e205760ede9f621b708a3f0e07943 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 16:36:51 +0200 Subject: [PATCH 079/252] Fix style --- src/transformers/__init__.py | 12 ++++++++++-- src/transformers/models/grounding_dino/__init__.py | 12 ++++++++++-- .../grounding_dino/configuration_grounding_dino.py | 9 +++++++-- .../grounding_dino/convert_grounding_dino_to_hf.py | 11 +++++------ .../models/grounding_dino/modeling_grounding_dino.py | 8 ++++++-- 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6ceff48c7c5cbc..aaab9c8fff2c21 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -358,7 +358,11 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], + "models.grounding_dino": [ + "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GroundingDINOConfig", + "GroundingDINOTextPrenetConfig", + ], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -4512,7 +4516,11 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, + GroundingDINOConfig, + GroundingDINOTextPrenetConfig, + ) from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index df2b0d907f1b65..8ed227086ac3ae 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -18,7 +18,11 @@ _import_structure = { - "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], + "configuration_grounding_dino": [ + "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GroundingDINOConfig", + "GroundingDINOTextPrenetConfig", + ], } try: @@ -36,7 +40,11 @@ if TYPE_CHECKING: - from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig + from .configuration_grounding_dino import ( + GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, + GroundingDINOConfig, + GroundingDINOTextPrenetConfig, + ) try: if not is_torch_available(): diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py 
b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e900714852fbaa..0de76985e82338 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -18,7 +18,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING, AutoConfig +from ..auto import CONFIG_MAPPING logger = logging.get_logger(__name__) @@ -27,6 +27,7 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } + # Modified from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet class GroundingDINOTextPrenetConfig(PretrainedConfig): r""" @@ -409,7 +410,11 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else GroundingDINOTextPrenetConfig(**text_backbone_config) + self.text_backbone_config = ( + GroundingDINOTextPrenetConfig() + if text_backbone_config is None + else GroundingDINOTextPrenetConfig(**text_backbone_config) + ) self.max_text_len = max_text_len # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 680c3872bf68dc..d58bebd09490cc 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -326,15 +326,17 @@ def preprocess_caption(caption: str) -> str: return result return result + "." - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer + tokenizer = AutoTokenizer.from_pretrained( + "bert-base-uncased" + ) # Using just for now since I didn't finish the tokenizer text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") return tokenized + @torch.no_grad() def convert_grounding_dino_checkpoint(args): - model_name = args.model_name pytorch_dump_folder_path = args.pytorch_dump_folder_path push_to_hub = args.push_to_hub @@ -374,10 +376,7 @@ def convert_grounding_dino_checkpoint(args): text_inputs = text_processor(text, config) # Running forward - output = model( - pixel_values=image_inputs.unsqueeze(0), - **text_inputs - ) + model(pixel_values=image_inputs.unsqueeze(0), **text_inputs) # output.pred_boxes[:, :3, :] # tensor([[[0.7674, 0.4136, 0.4572, 0.7305], diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 71e7cb33fba0b9..104ef8c3d20e92 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1932,7 +1932,7 @@ def custom_forward(*inputs): text_encoder_hidden_states, text_encoder_attention_mask, self_attn_mask, - None + None, ) else: layer_outputs = decoder_layer( @@ -2012,7 +2012,10 @@ def custom_forward(*inputs): text_cross_attentions=all_cross_attns_text, ) + SPECIAL_TOKENS = [101, 102, 1012, 1029] + + def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: """Generate attention mask between each pair of special tokens and positional ids. 
Args: @@ -2255,7 +2258,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) - text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere max_text_len = self.config.max_text_len if text_self_attention_masks.shape[1] > max_text_len: @@ -3654,6 +3657,7 @@ class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ + config_class = GroundingDINOTextPrenetConfig def __init__(self, config, add_pooling_layer=True): From 587589ee8fa07e4b36d85aaa312b1210ec655935 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 16:58:06 +0200 Subject: [PATCH 080/252] Improve fixup --- README.md | 2 +- README_es.md | 2 +- README_hd.md | 2 +- README_ja.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.md | 1 + docs/source/en/model_doc/grounding-dino.md | 12 +-- .../models/auto/image_processing_auto.py | 2 +- .../configuration_grounding_dino.py | 86 +++++++------------ .../test_modeling_grounding_dino.py | 5 ++ 12 files changed, 50 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index 3311a4785b54d7..4774e04faedecb 100644 --- a/README.md +++ b/README.md @@ -375,7 +375,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. 
**[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_es.md b/README_es.md index e5497cdd9cd8f6..3d1f6cc0099906 100644 --- a/README_es.md +++ b/README_es.md @@ -350,7 +350,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. 
**[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_hd.md b/README_hd.md index 7e85a8c53d1713..381792c11f76da 100644 --- a/README_hd.md +++ b/README_hd.md @@ -322,7 +322,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. 
**[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others से) Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. द्वाराअनुसंधान पत्र [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) के साथ जारी किया गया 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv .org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https ://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। diff --git a/README_ja.md b/README_ja.md index 8f347bdd79264e..d2c660fd257734 100644 --- a/README_ja.md +++ b/README_ja.md @@ -384,7 +384,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました. 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others から) Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 
から公開された研究論文 [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) diff --git a/README_ko.md b/README_ko.md index 31418f42b8a9ff..aa3c524f25f075 100644 --- a/README_ko.md +++ b/README_ko.md @@ -299,7 +299,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others 에서 제공)은 Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.의 [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499)논문과 함께 발표했습니다. 1. 
**[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 107ed00f3de87f..9bb392e266b57f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -323,7 +323,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (来自 Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) 伴随论文 [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) 由 Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang 发布。 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index a633740b292821..e0878fc3bc774c 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -335,7 +335,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index a1fbc63c7cc4e0..41627a7a81392f 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -148,6 +148,7 @@ Flax), PyTorch, and/or TensorFlow. | [GPTBigCode](model_doc/gpt_bigcode) | ✅ | ❌ | ❌ | | [GPTSAN-japanese](model_doc/gptsan-japanese) | ✅ | ❌ | ❌ | | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | +| [Grounding DINO](model_doc/grounding-dino) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index 161a90609174b3..05b5f84d698347 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -18,20 +18,22 @@ rendered properly in your Markdown viewer. ## Overview -The Grounding DINO model was proposed in []() by . - +The Grounding DINO model was proposed in [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. Grounding DINO extends a closed-set object detection model with a text encoder, enabling open-set object detection. The model achieves remarkable results, such as 52.5 AP on COCO zero-shot. The abstract from the paper is the following: -** +*In this paper, we present an open-set object detector, called Grounding DINO, by marrying Transformer-based detector DINO with grounded pre-training, which can detect arbitrary objects with human inputs such as category names or referring expressions. The key solution of open-set object detection is introducing language to a closed-set detector for open-set concept generalization. To effectively fuse language and vision modalities, we conceptually divide a closed-set detector into three phases and propose a tight fusion solution, which includes a feature enhancer, a language-guided query selection, and a cross-modality decoder for cross-modality fusion. While previous works mainly evaluate open-set object detection on novel categories, we propose to also perform evaluations on referring expression comprehension for objects specified with attributes. Grounding DINO performs remarkably well on all three settings, including benchmarks on COCO, LVIS, ODinW, and RefCOCO/+/g. Grounding DINO achieves a 52.5 AP on the COCO detection zero-shot transfer benchmark, i.e., without any training data from COCO. It sets a new record on the ODinW zero-shot benchmark with a mean 26.1 AP.* Tips: -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). 
+This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/IDEA-Research/GroundingDINO). +## GroundingDINOTextPrenetConfig + +[[autodoc]] GroundingDINOTextPrenetConfig ## GroundingDINOConfig diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 6399fe192616af..cf33369ef5492d 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -66,7 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), + ("grounding-dino", "DeformableDetrImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0de76985e82338..8ba34f727243b0 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -65,23 +65,19 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): type_vocab_size (`int`, *optional*, defaults to 2): The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`] or [`TFGroundingDINOTextPrenetModel`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + The index of the padding token in the token vocabulary. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - is_decoder (`bool`, *optional*, defaults to `False`): - Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. 
Examples: @@ -111,12 +107,10 @@ def __init__( attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, - initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, position_embedding_type="absolute", use_cache=True, - classifier_dropout=None, **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -131,11 +125,9 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache - self.classifier_dropout = classifier_dropout @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -167,55 +159,50 @@ class GroundingDINOConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - use_timm_backbone (`bool`, *optional*, defaults to `True`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] API. - backbone_config (`PretrainedConfig` or `dict`, *optional*): + backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `{'model_type': 'swin'}`): The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which case it will default to `ResNetConfig()`. text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): The configuration of the text backbone model. Should be a bert-like config. num_channels (`int`, *optional*, defaults to 3): The number of input channels. - num_queries (`int`, *optional*, defaults to 300): + num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`GroundingDINOModel`] can detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead. - d_model (`int`, *optional*, defaults to 256): - Dimension of the layers. encoder_layers (`int`, *optional*, defaults to 6): Number of encoder layers. - decoder_layers (`int`, *optional*, defaults to 6): - Number of decoder layers. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. encoder_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer encoder. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. decoder_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (`int`, *optional*, defaults to 1024): - Dimension of the "intermediate" (often named feed-forward) layer in decoder. - encoder_ffn_dim (`int`, *optional*, defaults to 1024): - Dimension of the "intermediate" (often named feed-forward) layer in decoder. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as an encoder/decoder or not. activation_function (`str` or `function`, *optional*, defaults to `"relu"`): The non-linear activation function (function or string) in the encoder and pooler. 
If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. dropout (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. activation_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for activations inside the fully connected layer. - init_std (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - init_xavier_std (`float`, *optional*, defaults to 1): - The scaling factor used for the Xavier initialization gain in the HM Attention map module. - encoder_layerdrop (`float`, *optional*, defaults to 0.0): - The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) - for more details. auxiliary_loss (`bool`, *optional*, defaults to `False`): Whether auxiliary decoding losses (loss at each decoder layer) are to be used. position_embedding_type (`str`, *optional*, defaults to `"sine"`): Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. - backbone (`str`, *optional*, defaults to `"resnet50"`): + backbone (`str`, *optional*, defaults to `"swin"`): Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional backbone from the timm package. For a list of all available models, see [this page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). @@ -224,36 +211,30 @@ class GroundingDINOConfig(PretrainedConfig): dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. - class_cost (`float`, *optional*, defaults to 1): - Relative weight of the classification error in the Hungarian matching cost. - bbox_cost (`float`, *optional*, defaults to 5): - Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. - giou_cost (`float`, *optional*, defaults to 2): - Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. - mask_loss_coefficient (`float`, *optional*, defaults to 1): - Relative weight of the Focal loss in the panoptic segmentation loss. - dice_loss_coefficient (`float`, *optional*, defaults to 1): - Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. - bbox_loss_coefficient (`float`, *optional*, defaults to 5): - Relative weight of the L1 bounding box loss in the object detection loss. - giou_loss_coefficient (`float`, *optional*, defaults to 2): - Relative weight of the generalized IoU loss in the object detection loss. - eos_coefficient (`float`, *optional*, defaults to 0.1): - Relative classification weight of the 'no-object' class in the object detection loss. num_feature_levels (`int`, *optional*, defaults to 4): The number of input feature levels. encoder_n_points (`int`, *optional*, defaults to 4): The number of sampled keys in each feature level for each attention head in the encoder. decoder_n_points (`int`, *optional*, defaults to 4): The number of sampled keys in each feature level for each attention head in the decoder. 
- two_stage (`bool`, *optional*, defaults to `False`): + two_stage (`bool`, *optional*, defaults to `True`): Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. - two_stage_num_proposals (`int`, *optional*, defaults to 300): + two_stage_num_proposals (`int`, *optional*, defaults to 900): The number of region proposals to be generated, in case `two_stage` is set to `True`. - with_box_refine (`bool`, *optional*, defaults to `False`): + with_box_refine (`bool`, *optional*, defaults to `True`): Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes based on the predictions from the previous layer. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. focal_alpha (`float`, *optional*, defaults to 0.25): Alpha parameter in the focal loss. disable_custom_kernels (`bool`, *optional*, defaults to `False`): @@ -261,8 +242,6 @@ class GroundingDINOConfig(PretrainedConfig): kernels are not supported by PyTorch ONNX export. max_text_len (`int`, *optional*, defaults to 256): The maximum length of the text input. - sub_sentence_present (`bool`, *optional*, defaults to `True`): - Whether to use sub-sentence present in the text input. text_enhancer_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the text enhancer. 
fusion_droppath (`float`, *optional*, defaults to 0.1): @@ -322,7 +301,6 @@ def __init__( dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, - return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", backbone="swin", @@ -337,11 +315,8 @@ def __init__( class_cost=1, bbox_cost=5, giou_cost=2, - mask_loss_coefficient=1, - dice_loss_coefficient=1, bbox_loss_coefficient=5, giou_loss_coefficient=2, - eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, # other parameters @@ -402,11 +377,8 @@ def __init__( self.bbox_cost = bbox_cost self.giou_cost = giou_cost # Loss coefficients - self.mask_loss_coefficient = mask_loss_coefficient - self.dice_loss_coefficient = dice_loss_coefficient self.bbox_loss_coefficient = bbox_loss_coefficient self.giou_loss_coefficient = giou_loss_coefficient - self.eos_coefficient = eos_coefficient self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 3007eef6399916..b4c35ba7bda906 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -193,6 +193,11 @@ class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTe test_pruning = False test_head_masking = False test_missing_keys = False + pipeline_model_mapping = ( + {"feature-extraction": GroundingDINOModel, "object-detection": GroundingDINOForObjectDetection} + if is_torch_available() + else {} + ) # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): From f68361110aea1e6d657dfe2a7bf6a6114eeb9e17 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 17:12:14 +0200 Subject: [PATCH 081/252] Improve conversion script --- .../convert_grounding_dino_to_hf.py | 26 ++++++++----------- .../grounding_dino/modeling_grounding_dino.py | 2 +- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d58bebd09490cc..1f5fbae366cd5b 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -14,8 +14,7 @@ # limitations under the License. """Convert GroundingDINO SimMIM checkpoints from the original repository. 
-URL: -https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" +URL: https://github.com/IDEA-Research/GroundingDINO""" import argparse @@ -342,16 +341,16 @@ def convert_grounding_dino_checkpoint(args): push_to_hub = args.push_to_hub checkpoint_mapping = { - "grounding-dino-tiny": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", - "grounding-dino-base": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth", + "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth", } # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) - checkpoint_path = checkpoint_mapping[model_name] - # Load original checkpoint - original_state_dict = torch.load(checkpoint_path, map_location="cpu") + checkpoint_url = checkpoint_mapping[model_name] + original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] + original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} # Rename keys new_state_dict = original_state_dict.copy() @@ -362,9 +361,12 @@ def convert_grounding_dino_checkpoint(args): read_in_q_k_v(new_state_dict, config) # Load HF implementation with default config and converted state dict - model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").eval() + model = GroundingDINOForObjectDetection(config) + model.eval() # model = GroundingDINOForObjectDetection(config=config).eval() - model.load_state_dict(new_state_dict, strict=False) + missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) # Load and process test image image = prepare_img() @@ -410,12 +412,6 @@ def convert_grounding_dino_checkpoint(args): choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", ) - # parser.add_argument( - # "--checkpoint_path", - # default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", - # type=str, - # help="Path to the original PyTorch checkpoint (.pth file).", - # ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 104ef8c3d20e92..b4e99fa6a776fc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 IDEA Research and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
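The conversion-script changes in the patch above boil down to a standard checkpoint-porting pattern: download the released weights, strip the `module.` prefix, rename parameters to the Hugging Face layout, and load them non-strictly so that mismatches are printed instead of raising. The sketch below only illustrates that pattern; the rename rule and the `convert_checkpoint` helper are hypothetical stand-ins for the script's real `rename_key` and `read_in_q_k_v` logic.

```python
# Minimal sketch of the conversion pattern, not the script's actual implementation.
import torch
from torch import nn


def convert_checkpoint(model: nn.Module, checkpoint_url: str) -> nn.Module:
    # Download the original weights; the released Grounding DINO checkpoints store them
    # under a "model" key with a "module." prefix, as handled in the script above.
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
    state_dict = {key.replace("module.", ""): value for key, value in state_dict.items()}

    # Map original parameter names onto the HF naming scheme (illustrative rule only;
    # the real mapping is built by rename_key and read_in_q_k_v).
    renamed = {key.replace("backbone.0", "model.backbone"): value for key, value in state_dict.items()}

    # Load non-strictly and surface anything that did not match, mirroring the patch.
    missing_keys, unexpected_keys = model.load_state_dict(renamed, strict=False)
    print("Missing keys:", missing_keys)
    print("Unexpected keys:", unexpected_keys)
    return model
```

Running it with the `grounding-dino-tiny` URL from the mapping above should reproduce the missing/unexpected key printout that the script relies on as a sanity check before comparing logits and boxes against the expected values.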
From 3a0c7420ec1397dd4ea7da6c0eb71d01f316c832 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 19:29:42 +0200 Subject: [PATCH 082/252] Improve conversion script --- .../convert_grounding_dino_to_hf.py | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 1f5fbae366cd5b..fa0455ba94eb39 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -360,10 +360,9 @@ def convert_grounding_dino_checkpoint(args): rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) - # Load HF implementation with default config and converted state dict + # Load HF model model = GroundingDINOForObjectDetection(config) model.eval() - # model = GroundingDINOForObjectDetection(config=config).eval() missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) print("Missing keys:", missing_keys) print("Unexpected keys:", unexpected_keys) @@ -378,23 +377,23 @@ def convert_grounding_dino_checkpoint(args): text_inputs = text_processor(text, config) # Running forward - model(pixel_values=image_inputs.unsqueeze(0), **text_inputs) - - # output.pred_boxes[:, :3, :] - # tensor([[[0.7674, 0.4136, 0.4572, 0.7305], - # [0.2566, 0.5463, 0.4760, 0.8777], - # [0.2585, 0.5442, 0.4640, 0.8683]]]) - # - # output.logits[:, :3, :4] - # tensor([[[-4.8913, -0.1900, -0.2161, -4.2374], - # [-4.9652, -0.3719, -0.3950, -4.2315], - # [-5.9599, -3.3765, -3.3104, -5.9752]]]) + with torch.no_grad(): + outputs = model(pixel_values=image_inputs.unsqueeze(0), **text_inputs) + + print("First values of logits:", outputs.logits[0, :3, :3]) + print("First values of boxes:", outputs.pred_boxes[0, :3, :3]) + + # verify outputs + expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]) + expected_logits = torch.tensor( + [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] + ) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) + assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4) if pytorch_dump_folder_path is not None: - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + print(f"Saving model and image processor for {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - - print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: From 6115547d4fcd1e4f532f503f6847b709f31507ab Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 21:04:07 +0200 Subject: [PATCH 083/252] Add GroundingDINOProcessor --- docs/source/en/model_doc/grounding-dino.md | 4 + src/transformers/__init__.py | 2 + .../models/grounding_dino/__init__.py | 2 + .../convert_grounding_dino_to_hf.py | 45 ++++-- .../processing_grounding_dino.py | 151 ++++++++++++++++++ 5 files changed, 189 insertions(+), 15 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index 05b5f84d698347..03c3549c32cb5f 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -31,6 +31,10 @@ Tips: This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and 
[nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/IDEA-Research/GroundingDINO). +## GroundingDINOProcessor + +[[autodoc]] GroundingDINOProcessor + ## GroundingDINOTextPrenetConfig [[autodoc]] GroundingDINOTextPrenetConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aaab9c8fff2c21..c73345163a37b9 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -361,6 +361,7 @@ "models.grounding_dino": [ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", + "GroundingDINOProcessor", "GroundingDINOTextPrenetConfig", ], "models.groupvit": [ @@ -4519,6 +4520,7 @@ from .models.grounding_dino import ( GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, + GroundingDINOProcessor, GroundingDINOTextPrenetConfig, ) from .models.groupvit import ( diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 8ed227086ac3ae..229666382564b8 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -23,6 +23,7 @@ "GroundingDINOConfig", "GroundingDINOTextPrenetConfig", ], + "processing_grounding_dino": ["GroundingDINOProcessor"], } try: @@ -45,6 +46,7 @@ GroundingDINOConfig, GroundingDINOTextPrenetConfig, ) + from .processing_grounding_dino import GroundingDINOProcessor try: if not is_torch_available(): diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index fa0455ba94eb39..042771e2a37280 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -23,7 +23,13 @@ from PIL import Image from torchvision import transforms as T -from transformers import AutoTokenizer, GroundingDINOConfig, GroundingDINOForObjectDetection +from transformers import ( + AutoTokenizer, + DeformableDetrImageProcessor, + GroundingDINOConfig, + GroundingDINOForObjectDetection, + GroundingDINOProcessor, +) IMAGENET_MEAN = [0.485, 0.456, 0.406] @@ -318,20 +324,21 @@ def prepare_img(): return image -def text_processor(text: str, config): - def preprocess_caption(caption: str) -> str: - result = caption.lower().strip() - if result.endswith("."): - return result - return result + "." +def preprocess_caption(caption: str) -> str: + result = caption.lower().strip() + if result.endswith("."): + return result + return result + "." 
+ +def text_processor(text: str): tokenizer = AutoTokenizer.from_pretrained( "bert-base-uncased" ) # Using just for now since I didn't finish the tokenizer text = preprocess_caption(text) - tokenized = tokenizer([text], padding="longest", return_tensors="pt") + original_text_inputs = tokenizer([text], padding="longest", return_tensors="pt") - return tokenized + return original_text_inputs @torch.no_grad() @@ -369,16 +376,23 @@ def convert_grounding_dino_checkpoint(args): # Load and process test image image = prepare_img() + transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) + original_pixel_values = transforms(image).unsqueeze(0) text = "a cat" - image_processor = T.Compose( - [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] - ) - image_inputs = image_processor(image) - text_inputs = text_processor(text, config) + text_inputs = text_processor(text) + + image_processor = DeformableDetrImageProcessor() + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer) + + inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") + + assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) + assert torch.allclose(text_inputs["input_ids"], inputs.input_ids, atol=1e-4) # Running forward with torch.no_grad(): - outputs = model(pixel_values=image_inputs.unsqueeze(0), **text_inputs) + outputs = model(pixel_values=original_pixel_values, **text_inputs) print("First values of logits:", outputs.logits[0, :3, :3]) print("First values of boxes:", outputs.pred_boxes[0, :3, :3]) @@ -390,6 +404,7 @@ def convert_grounding_dino_checkpoint(args): ) assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4) + print("Looks ok!") if pytorch_dump_folder_path is not None: print(f"Saving model and image processor for {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index e69de29bb2d1d6..5bc1feaa2d510c 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -0,0 +1,151 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Grounding DINO. 
+""" + +from typing import List, Optional, Union + +from ...image_utils import ImageInput +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType + + +class GroundingDINOProcessor(ProcessorMixin): + r""" + Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a + single processor. + + [`GroundingDINOProcessor`] offers all the functionalities of [`DeformableDetrImageProcessor`] and + [`AutoTokenizer`]. See the docstring of [`~GroundingDINOProcessor.__call__`] and [`~GroundingDINOProcessor.decode`] + for more information. + + Args: + image_processor (`DeformableDetrImageProcessor`): + An instance of [`DeformableDetrImageProcessor`]. The image processor is a required input. + tokenizer (`AutoTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + """ + attributes = ["image_processor", "tokenizer"] + image_processor_class = "DeformableDetrImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, tokenizer): + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_token_type_ids: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchEncoding: + """ + This method uses [`DeformableDetrImageProcessor.__call__`] method to prepare image(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. 
+ """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + + # Get only text + if images is None: + text_encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + return text_encoding + + # add pixel_values + encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + + if text is not None: + text_encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + else: + text_encoding = None + + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + + return encoding_image_processor + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. 
+ """ + return self.tokenizer.decode(*args, **kwargs) + + @property + # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) From cc1788f351f3eb0c47643c217f5e55ac8adc1dc7 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 21:47:32 +0200 Subject: [PATCH 084/252] More improvements --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 042771e2a37280..e5505b50297186 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -407,13 +407,14 @@ def convert_grounding_dino_checkpoint(args): print("Looks ok!") if pytorch_dump_folder_path is not None: - print(f"Saving model and image processor for {model_name} to {pytorch_dump_folder_path}") + print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print(f"Pushing model and image processor for {model_name} to hub") + print(f"Pushing model and processor for {model_name} to hub") model.push_to_hub(f"EduardoPacheco/{model_name}") + processor.push_to_hub(f"EduardoPacheco/{model_name}") if __name__ == "__main__": From a6dea4ada4796b2716bdfcf0b32cb1df97c5a959 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 21:55:01 +0200 Subject: [PATCH 085/252] Return token type ids --- .../convert_grounding_dino_to_hf.py | 16 ++-------------- .../grounding_dino/processing_grounding_dino.py | 2 +- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index e5505b50297186..a0d0d205454217 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -331,16 +331,6 @@ def preprocess_caption(caption: str) -> str: return result + "." 
-def text_processor(text: str): - tokenizer = AutoTokenizer.from_pretrained( - "bert-base-uncased" - ) # Using just for now since I didn't finish the tokenizer - text = preprocess_caption(text) - original_text_inputs = tokenizer([text], padding="longest", return_tensors="pt") - - return original_text_inputs - - @torch.no_grad() def convert_grounding_dino_checkpoint(args): model_name = args.model_name @@ -378,21 +368,19 @@ def convert_grounding_dino_checkpoint(args): image = prepare_img() transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) original_pixel_values = transforms(image).unsqueeze(0) - text = "a cat" - text_inputs = text_processor(text) image_processor = DeformableDetrImageProcessor() tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer) + text = "a cat" inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) - assert torch.allclose(text_inputs["input_ids"], inputs.input_ids, atol=1e-4) # Running forward with torch.no_grad(): - outputs = model(pixel_values=original_pixel_values, **text_inputs) + outputs = model(**inputs) print("First values of logits:", outputs.logits[0, :3, :3]) print("First values of boxes:", outputs.pred_boxes[0, :3, :3]) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 5bc1feaa2d510c..10fd6e9834a9c3 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -60,7 +60,7 @@ def __call__( return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, - return_token_type_ids: bool = False, + return_token_type_ids: bool = True, return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, From ae6e110142c1b679d8ed58f4066613562948980a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 14 Oct 2023 16:58:23 -0300 Subject: [PATCH 086/252] something --- .../models/auto/image_processing_auto.py | 2 +- .../models/auto/tokenization_auto.py | 1 + .../test_modeling_grounding_dino.py | 109 +++++++++++------- 3 files changed, 71 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index d6d722b3e0842b..7962ccc11c5ba7 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -66,7 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), + ("grounding-dino", "DeformableDetrImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index a5b167183ce913..5b4f69490d6146 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -165,6 +165,7 @@ ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)), ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), 
("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)), + ("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), ("hubert", ("Wav2Vec2CTCTokenizer", None)), diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 3007eef6399916..4058ab073fda2b 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -20,7 +20,7 @@ import unittest from typing import Dict, List, Tuple -from transformers import GroundingDINOConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available from transformers.file_utils import cached_property from transformers.testing_utils import ( require_timm, @@ -53,59 +53,57 @@ class GroundingDINOModelTester: def __init__( self, parent, + image_size=196, batch_size=8, - is_training=True, - use_labels=True, + is_training=False, + use_labels=False, hidden_size=32, num_hidden_layers=2, num_attention_heads=8, - intermediate_size=4, - hidden_act="gelu", + hidden_act="relu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, num_queries=12, num_channels=3, - image_size=196, n_targets=8, - num_labels=91, num_feature_levels=4, - encoder_n_points=2, - decoder_n_points=6, + intermediate_size=32 ): self.parent = parent self.batch_size = batch_size self.is_training = is_training self.use_labels = use_labels + self.image_size = image_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.num_queries = num_queries self.num_channels = num_channels - self.image_size = image_size self.n_targets = n_targets - self.num_labels = num_labels self.num_feature_levels = num_feature_levels - self.encoder_n_points = encoder_n_points - self.decoder_n_points = decoder_n_points - - # we also set the expected seq length for both encoder and decoder - self.encoder_seq_length = ( - math.ceil(self.image_size / 8) ** 2 - + math.ceil(self.image_size / 16) ** 2 - + math.ceil(self.image_size / 32) ** 2 - + math.ceil(self.image_size / 64) ** 2 - ) - self.decoder_seq_length = self.num_queries + self.intermediate_size = intermediate_size + def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + input_ids = torch.Tensor([[101, 1037, 4937, 1012, 102]]).long() + text_token_mask = torch.ones_like(input_ids).bool() + text_self_attention_masks = torch.Tensor([ + [[ True, False, False, False, False], + [False, True, True, True, False], + [False, True, True, True, False], + [False, True, True, True, False], + [False, False, False, False, True]] + ] + ).bool() + token_type_ids = torch.zeros_like(input_ids).long() + position_ids = torch.Tensor([[0, 0, 1, 2, 0]]).long() + labels = None if self.use_labels: # labels is a list of Dict (each Dict being the labels for a given example in the 
batch) @@ -120,16 +118,16 @@ def prepare_config_and_inputs(self): labels.append(target) config = self.get_config() - return config, pixel_values, pixel_mask, labels + return config, pixel_values, pixel_mask, input_ids, text_token_mask, text_self_attention_masks, text_self_attention_masks.copy(), token_type_ids, position_ids, labels def get_config(self): - resnet_config = ResNetConfig( + swin_config = SwinConfig( num_channels=3, - embeddings_size=10, - hidden_sizes=[10, 20, 30, 40], + hidden_size=128, + embed_dim=96, + image_size=self.image_size, + window_size=7, depths=[1, 1, 2, 1], - hidden_act="relu", - num_labels=3, out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) @@ -149,36 +147,67 @@ def get_config(self): encoder_n_points=self.encoder_n_points, decoder_n_points=self.decoder_n_points, use_timm_backbone=False, - backbone_config=resnet_config, + backbone_config=swin_config, ) def prepare_config_and_inputs_for_common(self): - config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + config, pixel_values, pixel_mask, input_ids, text_token_mask, text_self_attention_masks, attention_mask, token_type_ids, position_ids, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask, + "input_ids": input_ids, "text_token_mask": text_token_mask, + "text_self_attention_masks": text_self_attention_masks, "token_type_ids": token_type_ids, + "position_ids": position_ids, "attention_mask": attention_mask + } return config, inputs_dict - def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, labels): + def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, input_ids, text_token_mask, text_self_attention_masks, attention_mask, token_type_ids, position_ids, labels): model = GroundingDINOModel(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, + input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids + ) + + result = model(pixel_values=pixel_values, + input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids + ) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) - def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, input_ids, text_token_mask, text_self_attention_masks, attention_mask, token_type_ids, position_ids, labels): model = GroundingDINOForObjectDetection(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, + input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids + ) + result = model(pixel_values=pixel_values, 
+ input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids + ) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, + input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids, labels=labels + ) self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) From 9fba8c2470afb14a33beeae4072072b7c445eb43 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Oct 2023 19:36:12 +0200 Subject: [PATCH 087/252] Fix more tests --- .../grounding_dino/modeling_grounding_dino.py | 52 ++++++++++++------- .../test_modeling_grounding_dino.py | 31 +++++++---- 2 files changed, 53 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index b4e99fa6a776fc..368830354e8eb7 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -898,18 +898,24 @@ def forward( class GroundingDINOBiMultiHeadAttention(nn.Module): - def __init__(self, vision_dim: int, text_dim: int, embed_dim: int, num_heads: int, dropout: float = 0.1): + def __init__(self, config): super().__init__() + vision_dim = text_dim = config.d_model + embed_dim = config.encoder_ffn_dim // 2 + num_heads = config.encoder_attention_heads // 2 + dropout = config.fusion_dropout + self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads self.vision_dim = vision_dim self.text_dim = text_dim - assert ( - self.head_dim * self.num_heads == self.embed_dim - ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
+ ) self.scale = self.head_dim ** (-0.5) self.dropout = dropout @@ -958,8 +964,6 @@ def forward( Returns: _type_: _description_ """ - # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': - # import ipdb; ipdb.set_trace() bsz, tgt_len, _ = vision_features.size() vision_query_states = self.vision_proj(vision_features) * self.scale @@ -1097,13 +1101,7 @@ def __init__(self, config, init_values=1e-4): # pre layer norm self.layer_norm_vision = nn.LayerNorm(config.d_model) self.layer_norm_text = nn.LayerNorm(config.d_model) - self.attn = GroundingDINOBiMultiHeadAttention( - vision_dim=config.d_model, - text_dim=config.d_model, - embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.encoder_attention_heads // 2, - dropout=config.fusion_dropout, - ) + self.attn = GroundingDINOBiMultiHeadAttention(config) # add layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() @@ -1241,6 +1239,9 @@ def sine_func(x: torch.Tensor): class GroundingDINOEncoderLayer(nn.Module): def __init__(self, config) -> None: super().__init__() + + self.d_model = config.d_model + self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) @@ -1248,15 +1249,21 @@ def __init__(self, config) -> None: def get_text_position_embeddings( self, text_features: Tensor, text_position_embedding: Tensor, text_position_ids: Tensor ) -> Tensor: - bs, n_text, text_dim = text_features.shape + batch_size, seq_length, _ = text_features.shape if text_position_embedding is None and text_position_ids is None: text_position_embedding = ( - torch.arange(n_text, device=text_features.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs, 1, 1) + torch.arange(seq_length, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(batch_size, 1, 1) + ) + text_position_embedding = get_sine_pos_embed( + text_position_embedding, num_pos_feats=self.d_model, exchange_xy=False ) - text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) if text_position_ids is not None: text_position_embedding = get_sine_pos_embed( - text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + text_position_ids[..., None], num_pos_feats=self.d_model, exchange_xy=False ) return text_position_embedding @@ -2258,6 +2265,13 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere max_text_len = self.config.max_text_len @@ -2517,8 +2531,8 @@ def forward( self, pixel_values: torch.FloatTensor, input_ids: torch.LongTensor, - attention_mask: torch.LongTensor, - token_type_ids: torch.LongTensor, + attention_mask: torch.LongTensor = None, + token_type_ids: torch.LongTensor = None, pixel_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 
b4c35ba7bda906..20f0a23fb42316 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -33,7 +33,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -59,7 +59,7 @@ def __init__( hidden_size=32, num_hidden_layers=2, num_attention_heads=8, - intermediate_size=4, + intermediate_size=8, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, @@ -71,6 +71,7 @@ def __init__( num_feature_levels=4, encoder_n_points=2, decoder_n_points=6, + max_text_len=256, ): self.parent = parent self.batch_size = batch_size @@ -91,6 +92,7 @@ def __init__( self.num_feature_levels = num_feature_levels self.encoder_n_points = encoder_n_points self.decoder_n_points = decoder_n_points + self.max_text_len = max_text_len # we also set the expected seq length for both encoder and decoder self.encoder_seq_length = ( @@ -103,9 +105,10 @@ def __init__( def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + input_ids = ids_tensor([self.batch_size, self.max_text_len], self.num_labels) + labels = None if self.use_labels: # labels is a list of Dict (each Dict being the labels for a given example in the batch) @@ -120,7 +123,7 @@ def prepare_config_and_inputs(self): labels.append(target) config = self.get_config() - return config, pixel_values, pixel_mask, labels + return config, pixel_values, pixel_mask, input_ids, labels def get_config(self): resnet_config = ResNetConfig( @@ -150,35 +153,38 @@ def get_config(self): decoder_n_points=self.decoder_n_points, use_timm_backbone=False, backbone_config=resnet_config, + max_text_len=self.max_text_len, ) def prepare_config_and_inputs_for_common(self): - config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + config, pixel_values, pixel_mask, input_ids, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask, "input_ids": input_ids} return config, inputs_dict - def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, labels): + def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, input_ids, labels): model = GroundingDINOModel(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) - def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + def create_and_check_grounding_dino_object_detection_head_model( + self, config, pixel_values, pixel_mask, input_ids, labels + ): model = GroundingDINOForObjectDetection(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = 
model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) result = model(pixel_values) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids, labels=labels) self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) @@ -203,6 +209,9 @@ class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTe def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + for k, v in inputs_dict.items(): + print(k, v.shape) + if return_labels: if model_class.__name__ == "GroundingDINOForObjectDetection": labels = [] From 684a0bb05a57d3ad0d11821788a046068c1e8448 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Oct 2023 19:45:44 +0200 Subject: [PATCH 088/252] More improvements --- .../configuration_grounding_dino.py | 21 ++++++++---------- .../grounding_dino/modeling_grounding_dino.py | 12 +++++----- .../test_modeling_grounding_dino.py | 22 ++++++------------- 3 files changed, 22 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 8ba34f727243b0..3a1740ceebcf27 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -28,22 +28,20 @@ } -# Modified from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet class GroundingDINOTextPrenetConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a - [`TFGroundingDINOTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, - defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration - to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. + This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`]. It is used to + instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BERT + [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`] or [`TFGroundingDINOTextPrenetModel`]. + `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
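The tests earlier in this series hard-code `text_self_attention_masks` and `position_ids` for `input_ids = [[101, 1037, 4937, 1012, 102]]`, and the modeling code derives them with `generate_masks_with_special_tokens_and_transfer_map`. The sketch below reproduces the tensors shown in those tests (a block-diagonal mask where tokens between two special tokens attend only within their phrase, special tokens attend only to themselves, and position ids restart per phrase); it is an illustration of the idea, not the library function itself, and the special-token ids are simply the ones appearing in the test input.

```python
import torch


def block_diagonal_text_masks(input_ids, special_token_ids=(101, 102, 1012)):
    # Sketch of the text masking hard-coded in the tests above; not library code.
    batch_size, seq_len = input_ids.shape
    masks = torch.zeros((batch_size, seq_len, seq_len), dtype=torch.bool)
    position_ids = torch.zeros((batch_size, seq_len), dtype=torch.long)
    for batch in range(batch_size):
        previous = -1
        for col in range(seq_len):
            if input_ids[batch, col].item() in special_token_ids:
                if col == previous + 1:
                    # isolated special token ([CLS], [SEP]) attends only to itself
                    masks[batch, col, col] = True
                else:
                    # phrase between two special tokens attends within itself
                    masks[batch, previous + 1 : col + 1, previous + 1 : col + 1] = True
                    position_ids[batch, previous + 1 : col + 1] = torch.arange(col - previous)
                previous = col
    return masks, position_ids


masks, position_ids = block_diagonal_text_masks(torch.tensor([[101, 1037, 4937, 1012, 102]]))
print(masks[0].int())  # block-diagonal pattern matching the test tensor
print(position_ids)    # tensor([[0, 0, 1, 2, 0]])
```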
num_hidden_layers (`int`, *optional*, defaults to 12): @@ -63,8 +61,7 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`] or - [`TFGroundingDINOTextPrenetModel`]. + The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`]. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. pad_token_id (`int`, *optional*, defaults to 0): @@ -153,7 +150,7 @@ class GroundingDINOConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO - [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + [idea-research/grounding-dino-tiny](https://huggingface.co/idea-research/grounding-dino-tiny) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -267,10 +264,10 @@ class GroundingDINOConfig(PretrainedConfig): ```python >>> from transformers import GroundingDINOConfig, GroundingDINOModel - >>> # Initializing a Grounding DINO SenseTime/deformable-detr style configuration + >>> # Initializing a Grounding DINO idea-research/grounding-dino-tiny style configuration >>> configuration = GroundingDINOConfig() - >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> # Initializing a model (with random weights) from the idea-research/grounding-dino-tiny style configuration >>> model = GroundingDINOModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 368830354e8eb7..81e5e8ce22b97a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2226,8 +2226,8 @@ def forward( self, pixel_values: Tensor, input_ids: Tensor, - token_type_ids: Tensor, - attention_mask: Tensor, + token_type_ids: Tensor = None, + attention_mask: Tensor = None, pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, @@ -2247,8 +2247,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") - >>> model = GroundingDINOModel.from_pretrained("SenseTime/deformable-detr") + >>> image_processor = AutoImageProcessor.from_pretrained("idea-research/grounding-dino-tiny") + >>> model = GroundingDINOModel.from_pretrained("idea-research/grounding-dino-tiny") >>> inputs = image_processor(images=image, return_tensors="pt") @@ -2559,8 +2559,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> image_processor = 
AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") - >>> model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr") + >>> image_processor = AutoImageProcessor.from_pretrained("idea-research/grounding-dino-tiny") + >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") >>> inputs = image_processor(images=image, return_tensors="pt") >>> outputs = model(**inputs) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 20f0a23fb42316..25b24786ab2054 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -209,9 +209,6 @@ class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTe def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - for k, v in inputs_dict.items(): - print(k, v.shape) - if return_labels: if model_class.__name__ == "GroundingDINOForObjectDetection": labels = [] @@ -413,7 +410,6 @@ def recursive_check(tuple_object, dict_object): recursive_check(tuple_output, dict_output) for model_class in self.all_model_classes: - print("Model class:", model_class) model = model_class(config) model.to(torch_device) model.eval() @@ -494,17 +490,13 @@ def test_forward_signature(self): # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] - if model.config.is_encoder_decoder: - expected_arg_names = ["pixel_values", "pixel_mask"] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" in arg_names - else [] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - else: - expected_arg_names = ["pixel_values", "pixel_mask"] - self.assertListEqual(arg_names[:1], expected_arg_names) + expected_arg_names = ["pixel_values", "input_ids"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) def test_different_timm_backbone(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From 3b2d57612dcb716889f338bc8bba6e4c11bea39b Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Oct 2023 19:53:11 +0200 Subject: [PATCH 089/252] More cleanup --- .../grounding_dino/modeling_grounding_dino.py | 70 ++----------------- 1 file changed, 4 insertions(+), 66 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 81e5e8ce22b97a..eff5899f0fb6e1 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -41,7 +41,7 @@ ) from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, + BaseModelOutputWithPooling, ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer @@ -3660,18 +3660,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class 
GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of - cross-attention is added between the self-attention layers, following the architecture described in [Attention is - all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, - Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set - to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and - `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. - """ - config_class = GroundingDINOTextPrenetConfig def __init__(self, config, add_pooling_layer=True): @@ -3708,45 +3696,16 @@ def forward( position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). 
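The cleanup above removes the decoder-specific machinery (cross-attention inputs, `past_key_values`, `use_cache`) from `GroundingDINOTextPrenet`, leaving a plain encoder-only BERT-style text backbone that returns a pooled output. As a rough illustration of that contract only, using the stock `bert-base-uncased` checkpoint as a stand-in rather than Grounding DINO's own text weights:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Stand-in illustration: bert-base-uncased behaves like the stripped-down text
# backbone (encoder-only, no cross-attention, no cache), returning a pooled output.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("a cat.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, 768)
print(outputs.pooler_output.shape)      # (batch_size, 768)
```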
- """ + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -3760,11 +3719,8 @@ def forward( batch_size, seq_length = input_shape device = input_ids.device if input_ids is not None else inputs_embeds.device - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + attention_mask = torch.ones(((batch_size, seq_length)), device=device) if token_type_ids is None: if hasattr(self.embeddings, "token_type_ids"): @@ -3778,17 +3734,6 @@ def forward( # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -3801,16 +3746,11 @@ def forward( position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -3821,11 +3761,9 @@ def forward( if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( + return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) From 88e5d0201d8c5fc06b2e1b13751847776d7ba431 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Oct 2023 20:09:52 +0200 Subject: [PATCH 090/252] More improvements --- .../grounding_dino/convert_grounding_dino_to_hf.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 13 ++++++------- 
.../grounding_dino/test_modeling_grounding_dino.py | 14 ++++++-------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index a0d0d205454217..2ddfcf34b80615 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -252,7 +252,7 @@ def create_rename_keys(state_dict, config): rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE if "feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) + rename_keys.append((layer_name, layer_name.replace("feat_map", "model.text_projection"))) #### DECODER REFERENCE POINT HEAD if "transformer.decoder.ref_point_head" in layer_name: rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index eff5899f0fb6e1..b160e2b252988b 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1511,8 +1511,8 @@ def _init_weights(self, module): if p.dim() > 1: nn.init.xavier_uniform_(p) elif isinstance(module, GroundingDINOModel): - nn.init.constant_(module.input_proj_text.bias.data, 0) - nn.init.xavier_uniform_(module.input_proj_text.weight.data) + nn.init.constant_(module.text_projection.bias.data, 0) + nn.init.xavier_uniform_(module.text_projection.weight.data) for proj in module.input_proj_vision: nn.init.xavier_uniform_(proj[0].weight, gain=1) nn.init.constant_(proj[0].bias, 0) @@ -2108,7 +2108,7 @@ def __init__(self, config: GroundingDINOConfig): # Create text backbone self.text_backbone = GroundingDINOTextPrenet(config.text_backbone_config) - self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) + self.text_projection = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) @@ -2118,6 +2118,8 @@ def __init__(self, config: GroundingDINOConfig): self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + print("Two stage:", config.two_stage) + if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) @@ -2286,7 +2288,7 @@ def forward( text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ "last_hidden_state" ] - text_features = self.input_proj_text(text_features) + text_features = self.text_projection(text_features) batch_size, num_channels, height, width = pixel_values.shape device = pixel_values.device @@ -3223,9 +3225,6 @@ def forward( return embeddings -# Classes for Text Backbone (It's just a BERT model) - - # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py 
b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 25b24786ab2054..59ebb0cdb6b2f0 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -161,7 +161,7 @@ def prepare_config_and_inputs_for_common(self): inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask, "input_ids": input_ids} return config, inputs_dict - def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, input_ids, labels): + def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, labels): model = GroundingDINOModel(config=config) model.to(torch_device) model.eval() @@ -171,9 +171,7 @@ def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) - def create_and_check_grounding_dino_object_detection_head_model( - self, config, pixel_values, pixel_mask, input_ids, labels - ): + def create_and_check_object_detection_head_model(self, config, pixel_values, pixel_mask, input_ids, labels): model = GroundingDINOForObjectDetection(config=config) model.to(torch_device) model.eval() @@ -244,13 +242,13 @@ def test_config(self): self.config_tester.create_and_test_config_with_num_labels() self.config_tester.check_config_can_be_init_without_params() - def test_grounding_dino_model(self): + def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_grounding_dino_model(*config_and_inputs) + self.model_tester.create_and_check_model(*config_and_inputs) - def test_grounding_dino_object_detection_head_model(self): + def test_object_detection_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_grounding_dino_object_detection_head_model(*config_and_inputs) + self.model_tester.create_and_check_object_detection_head_model(*config_and_inputs) @unittest.skip(reason="Grounding DINO does not use inputs_embeds") def test_inputs_embeds(self): From 8bae1bd61ff9c1ff9cd8b2a220443437e8c8569f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 16 Oct 2023 17:00:11 -0300 Subject: [PATCH 091/252] Fixed tests, improved modeling and config --- .../configuration_grounding_dino.py | 12 +----- .../grounding_dino/modeling_grounding_dino.py | 39 +++++++++---------- .../test_modeling_grounding_dino.py | 19 ++++----- 3 files changed, 27 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3a1740ceebcf27..6b1f6c1913e7e2 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -168,8 +168,7 @@ class GroundingDINOConfig(PretrainedConfig): The number of input channels. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects - [`GroundingDINOModel`] can detect in a single image. In case `two_stage` is set to `True`, we use - `two_stage_num_proposals` instead. + [`GroundingDINOModel`] can detect in a single image. encoder_layers (`int`, *optional*, defaults to 6): Number of encoder layers. 
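This commit also drops `two_stage_num_proposals` from the configuration, so `num_queries` now controls both the number of detection slots and the number of two-stage proposals, and the constructor keeps validating that `two_stage=True` requires `with_box_refine=True`. A hedged sketch using `GroundingDINOConfig` as the class is named at this point in the series (an in-progress API, not a released one):

```python
# Assumes the in-progress GroundingDINOConfig class registered earlier in this patch series.
from transformers import GroundingDINOConfig

config = GroundingDINOConfig(num_queries=300, two_stage=True, with_box_refine=True)
print(config.num_queries)  # 300 -- also the number of top-scoring two-stage proposals kept

# The constructor rejects the incompatible flag combination checked in __init__:
try:
    GroundingDINOConfig(two_stage=True, with_box_refine=False)
except ValueError as err:
    print(err)  # If two_stage is True, with_box_refine must be True.
```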
encoder_ffn_dim (`int`, *optional*, defaults to 2048): @@ -217,8 +216,6 @@ class GroundingDINOConfig(PretrainedConfig): two_stage (`bool`, *optional*, defaults to `True`): Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. - two_stage_num_proposals (`int`, *optional*, defaults to 900): - The number of region proposals to be generated, in case `two_stage` is set to `True`. with_box_refine (`bool`, *optional*, defaults to `True`): Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes based on the predictions from the previous layer. @@ -254,9 +251,6 @@ class GroundingDINOConfig(PretrainedConfig): two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. - two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the class embedding between the two-stage bbox generator and the region proposal - generation. positional_embedding_temperature (`float`, *optional*, defaults to 20): The temperature for Sine Positional Embedding that is used together with vision backbone. Examples: @@ -307,7 +301,6 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - two_stage_num_proposals=900, with_box_refine=True, class_cost=1, bbox_cost=5, @@ -325,7 +318,6 @@ def __init__( query_dim=4, decoder_bbox_embed_share=True, two_stage_bbox_embed_share=False, - two_stage_class_embed_share=False, positional_embedding_temperature=20, **kwargs, ): @@ -365,7 +357,6 @@ def __init__( self.encoder_n_points = encoder_n_points self.decoder_n_points = decoder_n_points self.two_stage = two_stage - self.two_stage_num_proposals = two_stage_num_proposals self.with_box_refine = with_box_refine if two_stage is True and with_box_refine is False: raise ValueError("If two_stage is True, with_box_refine must be True.") @@ -397,7 +388,6 @@ def __init__( self.two_stage_bbox_embed_share = two_stage_bbox_embed_share if two_stage_bbox_embed_share and not decoder_bbox_embed_share: raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") - self.two_stage_class_embed_share = two_stage_class_embed_share self.positional_embedding_temperature = positional_embedding_temperature super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index b160e2b252988b..36c631c9a85d12 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -284,7 +284,7 @@ class GroundingDINOModelOutput(ModelOutput): sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, used to compute the weighted average in the bi-attention heads. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): - Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. 
Output of bounding box binary classification (i.e. foreground and background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): @@ -387,7 +387,7 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Initial reference points sent through the Transformer decoder. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): - Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): @@ -1521,6 +1521,9 @@ def _init_weights(self, module): nn.init.constant_(module.reference_points.bias.data, 0.0) if hasattr(module, "level_embed"): nn.init.normal_(module.level_embed) + if isinstance(module, GroundingDINOMLPPredictionHead): + nn.init.constant_(module.layers[-1].weight.data, 0) + nn.init.constant_(module.layers[-1].bias.data, 0) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, GroundingDINODecoder): @@ -2123,8 +2126,14 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) - self.encoder_output_bbox_embed = None - self.encoder_output_class_embed = None + if config.two_stage_bbox_embed_share and config.decoder_bbox_embed_share and self.decoder.bbox_embed is not None: + self.encoder_output_bbox_embed = self.decoder.bbox_embed + else: + self.encoder_output_bbox_embed = GroundingDINOMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + self.encoder_output_class_embed = GroundingDINOContrastiveEmbedding(config) else: self.reference_points = nn.Embedding(config.num_queries, 4) @@ -2403,8 +2412,8 @@ def forward( delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals - # only keep top scoring `config.two_stage_num_proposals` proposals - topk = self.config.two_stage_num_proposals + # only keep top scoring `config.num_queries` proposals + topk = self.config.num_queries topk_logits = enc_outputs_class.max(-1)[0] topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] topk_coords_logits = torch.gather( @@ -2492,9 +2501,6 @@ def __init__(self, config: GroundingDINOConfig): input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) - nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - if config.decoder_bbox_embed_share: self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: @@ -2504,18 +2510,6 @@ def __init__(self, config: GroundingDINOConfig): self.model.decoder.bbox_embed = self.bbox_embed self.model.decoder.class_embed = self.class_embed - if config.two_stage: - if config.two_stage_bbox_embed_share: - self.model.encoder_output_bbox_embed = _bbox_embed - else: - 
self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) - - # TODO don't believe this is necessary since class_embed has no parameters - if config.two_stage_class_embed_share: - self.model.encoder_output_class_embed = _class_embed - else: - self.model.encoder_output_class_embed = copy.deepcopy(_class_embed) - # Initialize weights and apply final processing self.post_init() @@ -2584,6 +2578,9 @@ def forward( ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( pixel_values=pixel_values, diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 59ebb0cdb6b2f0..fc398b0822594b 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -20,7 +20,7 @@ import unittest from typing import Dict, List, Tuple -from transformers import GroundingDINOConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available from transformers.file_utils import cached_property from transformers.testing_utils import ( require_timm, @@ -126,13 +126,12 @@ def prepare_config_and_inputs(self): return config, pixel_values, pixel_mask, input_ids, labels def get_config(self): - resnet_config = ResNetConfig( - num_channels=3, - embeddings_size=10, - hidden_sizes=[10, 20, 30, 40], - depths=[1, 1, 2, 1], - hidden_act="relu", - num_labels=3, + swin_config = SwinConfig( + window_size=7, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + image_size=self.image_size, out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) @@ -152,7 +151,7 @@ def get_config(self): encoder_n_points=self.encoder_n_points, decoder_n_points=self.decoder_n_points, use_timm_backbone=False, - backbone_config=resnet_config, + backbone_config=swin_config, max_text_len=self.max_text_len, ) @@ -167,7 +166,6 @@ def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, la model.eval() result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) - result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) @@ -177,7 +175,6 @@ def create_and_check_object_detection_head_model(self, config, pixel_values, pix model.eval() result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) - result = model(pixel_values) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) From f343f78f4af2f26c53b0d00036ece10cfac0cc46 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 17 Oct 2023 00:41:14 -0300 Subject: [PATCH 092/252] More improvements and fixing tests --- .../grounding_dino/modeling_grounding_dino.py | 200 +++++------------- .../test_modeling_grounding_dino.py | 32 +-- 2 files changed, 69 insertions(+), 163 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 36c631c9a85d12..9e657f168d3638 100644 --- 
a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -152,27 +152,17 @@ class GroundingDINODecoderOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the self-attention heads. - vision_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, - used to compute the weighted average in the cross-attention heads. - text_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the encoder's cross-attention layer, after the attention softmax, - used to compute the weighted average in the text cross-attention heads. + the self-attention, cross-attention and multi-scale deformable attention heads. """ last_hidden_state: torch.FloatTensor = None intermediate_hidden_states: torch.FloatTensor = None intermediate_reference_points: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None @dataclass @@ -181,8 +171,6 @@ class GroundingDINOEncoderOutput(ModelOutput): Base class for outputs of the GroundingDINOEncoder. This class extends BaseModelOutput, due to: - vision and text last hidden states - vision and text intermediate hidden states - - vision and text attentions - - vision and text cross attentions Args: last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -197,32 +185,17 @@ class GroundingDINOEncoderOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. 
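The previous commit in this series replaced `two_stage_num_proposals` with `config.num_queries` when selecting encoder region proposals (`topk = self.config.num_queries`, followed by a `torch.topk` over the class logits and a `torch.gather` over the box logits). A standalone sketch of that selection step with made-up shapes, purely for illustration:

```python
import torch

# Illustrative shapes only; these are not the module's actual tensors.
batch_size, num_proposals, num_labels, num_queries = 2, 1000, 256, 900
enc_outputs_class = torch.randn(batch_size, num_proposals, num_labels)
enc_outputs_coord_logits = torch.randn(batch_size, num_proposals, 4)

# Score each proposal by its best class logit, keep the top `num_queries` of them.
topk_logits = enc_outputs_class.max(-1)[0]                       # (batch_size, num_proposals)
topk_proposals = torch.topk(topk_logits, num_queries, dim=1)[1]  # indices of kept proposals
topk_coords_logits = torch.gather(
    enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
)
print(topk_coords_logits.shape)  # (batch_size, num_queries, 4)
```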
- attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, - used to compute the weighted average in the multi-scale deformable attention heads. - attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, - used to compute the weighted average in the self-attention heads. - cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. - cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. + attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. """ last_hidden_state_vision: torch.FloatTensor = None last_hidden_state_text: torch.FloatTensor = None hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None - attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - attentions_text: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None @dataclass @@ -243,18 +216,10 @@ class GroundingDINOModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. - decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, - num_queries)`. 
Attentions weights of the decoder, after the attention softmax, used to compute the weighted - average in the self-attention heads. - decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the - weighted average in the cross-attention heads. - decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the - weighted average in the cross-attention heads. + decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention, cross-attention and multi-scale deformable attention heads. encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -267,21 +232,10 @@ class GroundingDINOModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, - used to compute the weighted average in the multi-scale deformable attention heads. - encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, - used to compute the weighted average in the self-attention heads. - encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. 
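Likewise, a hedged sketch for the consolidated `encoder_attentions`, assuming the four-way grouping assembled in the encoder's forward further down (fused vision, fused text, text-enhancer self-attention, multi-scale deformable); `outputs` stands for a `GroundingDINOModelOutput` produced with `output_attentions=True`:

fused_vision_attns, fused_text_attns, enhanced_text_attns, deformable_attns = outputs.encoder_attentions
# each inner tuple is expected to hold one tensor per encoder layer
assert len(deformable_attns) == len(enhanced_text_attns)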
- encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. attention softmax, used to compute the weighted average in the bi-attention heads. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are @@ -296,17 +250,12 @@ class GroundingDINOModelOutput(ModelOutput): intermediate_hidden_states: torch.FloatTensor = None intermediate_reference_points: torch.FloatTensor = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None - encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @@ -340,18 +289,10 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. - decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, - num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted - average in the self-attention heads. - decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. 
- Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the - weighted average in the cross-attention heads. - decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the - weighted average in the cross-attention heads. + decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention, cross-attention and multi-scale deformable attention heads. encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -364,22 +305,10 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, - used to compute the weighted average in the multi-scale deformable attention heads. - encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, - used to compute the weighted average in the self-attention heads. - encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. - encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. 
Attentions weights of the text encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. + encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -404,17 +333,16 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): intermediate_hidden_states: Optional[torch.FloatTensor] = None intermediate_reference_points: Optional[torch.FloatTensor] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None - encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + # encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + # encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + # encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + # encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @@ -1704,6 +1632,7 @@ def forward( encoder_vision_states = () if output_hidden_states else None encoder_text_states = () if output_hidden_states else None + all_attns = () if output_attentions else None all_attn_fused_text = () if output_attentions else None all_attn_fused_vision = () if output_attentions else None all_attn_enhanced_text = () if output_attentions else None @@ -1712,18 +1641,7 @@ def forward( if output_hidden_states: encoder_vision_states += (vision_features,) encoder_text_states += (text_features,) - # INPUTS FOR ENCODER LAYER - # - vision_features: Tensor, - # - vision_position_embedding: Tensor, - # - spatial_shapes: Tensor, - # - level_start_index: Tensor, - # - key_padding_mask: Tensor, - # - reference_points: Tensor, - # - text_features: Optional[Tensor] = None, - # - text_attention_mask: Optional[Tensor] = None, - # - text_position_embedding: Optional[Tensor] = None, - # - text_self_attention_masks: Optional[Tensor] = 
None, - # - text_position_ids: Optional[Tensor] = None + (vision_features, text_features), attentions = encoder_layer( vision_features=vision_features, vision_position_embedding=vision_position_embedding, @@ -1748,14 +1666,14 @@ def forward( encoder_vision_states += (vision_features,) encoder_text_states += (text_features,) + if output_attentions: + all_attns = (all_attn_fused_vision, all_attn_fused_text, all_attn_enhanced_text, all_attn_deformable) + if not return_dict: enc_outputs = [ vision_features, text_features, - all_attn_fused_vision, - all_attn_fused_text, - all_attn_enhanced_text, - all_attn_deformable, + all_attns ] return tuple(v for v in enc_outputs if v is not None) return GroundingDINOEncoderOutput( @@ -1763,10 +1681,7 @@ def forward( last_hidden_state_text=text_features, hidden_states_vision=encoder_vision_states, hidden_states_text=encoder_text_states, - cross_attentions_vision=all_attn_fused_vision, - cross_attentions_text=all_attn_fused_text, - attentions_vision=all_attn_deformable, - attentions_text=all_attn_enhanced_text, + attentions=all_attns, ) @@ -1899,6 +1814,7 @@ def forward( # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None + all_attns = () if output_attentions else None all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None intermediate = () @@ -1998,6 +1914,9 @@ def custom_forward(*inputs): if output_hidden_states: all_hidden_states += (hidden_states,) + if output_attentions: + all_attns += (all_self_attns, all_cross_attns_text, all_cross_attns_vision) + if not return_dict: return tuple( v @@ -2006,9 +1925,7 @@ def custom_forward(*inputs): intermediate, intermediate_reference_points, all_hidden_states, - all_self_attns, - all_cross_attns_vision, - all_cross_attns_text, + all_attns, ] if v is not None ) @@ -2017,9 +1934,7 @@ def custom_forward(*inputs): intermediate_hidden_states=intermediate, intermediate_reference_points=intermediate_reference_points, hidden_states=all_hidden_states, - attentions=all_self_attns, - vision_cross_attentions=all_cross_attns_vision, - text_cross_attentions=all_cross_attns_text, + attentions=all_attns, ) @@ -2388,10 +2303,7 @@ def forward( last_hidden_state_text=encoder_outputs[1], hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, - attentions_vision=encoder_outputs[4] if len(encoder_outputs) > 4 else None, - attentions_text=encoder_outputs[5] if len(encoder_outputs) > 5 else None, - cross_attentions_vision=encoder_outputs[6] if len(encoder_outputs) > 6 else None, - cross_attentions_text=encoder_outputs[7] if len(encoder_outputs) > 7 else None, + attentions=encoder_outputs[4] if len(encoder_outputs) > 4 else None, ) # Fifth, prepare decoder inputs @@ -2463,16 +2375,11 @@ def forward( intermediate_reference_points=decoder_outputs.intermediate_reference_points, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, - decoder_cross_attentions_vision=decoder_outputs.vision_cross_attentions, - decoder_cross_attentions_text=decoder_outputs.text_cross_attentions, encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, 
encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, encoder_hidden_states_text=encoder_outputs.hidden_states_text, - encoder_attentions_vision=encoder_outputs.attentions_vision, - encoder_attentions_text=encoder_outputs.attentions_text, - encoder_cross_attentions_vision=encoder_outputs.cross_attentions_vision, - encoder_cross_attentions_text=encoder_outputs.cross_attentions_text, + encoder_attentions=encoder_outputs.attentions, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, ) @@ -2487,7 +2394,7 @@ def forward( ) class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required - _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*"] def __init__(self, config: GroundingDINOConfig): super().__init__(config) @@ -2595,7 +2502,7 @@ def forward( ) hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] - enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[9] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[7] init_reference = outputs.init_reference_points if return_dict else outputs[0] inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] @@ -2686,16 +2593,11 @@ def forward( last_hidden_state=outputs.last_hidden_state, decoder_hidden_states=outputs.decoder_hidden_states, decoder_attentions=outputs.decoder_attentions, - decoder_cross_attentions_vision=outputs.decoder_cross_attentions_vision, - decoder_cross_attentions_text=outputs.decoder_cross_attentions_text, encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, encoder_hidden_states_text=outputs.encoder_hidden_states_text, - encoder_attentions_vision=outputs.encoder_attentions_vision, - encoder_attentions_text=outputs.encoder_attentions_text, - encoder_cross_attentions_text=outputs.encoder_cross_attentions_text, - encoder_cross_attentions_vision=outputs.encoder_cross_attentions_vision, + encoder_attentions=outputs.encoder_attentions, intermediate_hidden_states=outputs.intermediate_hidden_states, intermediate_reference_points=outputs.intermediate_reference_points, init_reference_points=outputs.init_reference_points, diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index fc398b0822594b..8592f9036dac10 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -176,13 +176,13 @@ def create_and_check_object_detection_head_model(self, config, pixel_values, pix result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, config.max_text_len)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids, labels=labels) self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, config.max_text_len)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) @@ -280,7 +280,7 @@ def test_attention_outputs(self): model.eval() with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions + attentions = outputs.encoder_attentions[-1] self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) # check that output_attentions also work using config @@ -291,7 +291,7 @@ def test_attention_outputs(self): model.eval() with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions + attentions = outputs.encoder_attentions[-1] self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( @@ -304,7 +304,7 @@ def test_attention_outputs(self): ) out_len = len(outputs) - correct_outlen = 8 + correct_outlen = 10 # loss is at first position if "labels" in inputs_dict: @@ -316,7 +316,7 @@ def test_attention_outputs(self): self.assertEqual(out_len, correct_outlen) # decoder attentions - decoder_attentions = outputs.decoder_attentions + decoder_attentions = outputs.decoder_attentions[0] self.assertIsInstance(decoder_attentions, (list, tuple)) self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( @@ -325,7 +325,7 @@ def test_attention_outputs(self): ) # cross attentions - cross_attentions = outputs.cross_attentions + cross_attentions = outputs.decoder_attentions[-1] self.assertIsInstance(cross_attentions, (list, tuple)) self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( @@ -349,12 +349,12 @@ def test_attention_outputs(self): if hasattr(self.model_tester, "num_hidden_states_types"): added_hidden_states = self.model_tester.num_hidden_states_types elif self.is_encoder_decoder: - added_hidden_states = 2 + added_hidden_states = 3 else: added_hidden_states = 1 self.assertEqual(out_len + added_hidden_states, len(outputs)) - self_attentions = outputs.encoder_attentions + self_attentions = outputs.encoder_attentions[-1] self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( @@ -409,18 +409,22 @@ def recursive_check(tuple_object, dict_object): model.to(torch_device) model.eval() + print("Done 1") tuple_inputs = self._prepare_for_class(inputs_dict, model_class) dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs) + print("Done 2") tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence(model, tuple_inputs, dict_inputs) + print("Done 3") tuple_inputs = self._prepare_for_class(inputs_dict, model_class) dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + print("Done 4") tuple_inputs = self._prepare_for_class(inputs_dict, model_class) dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) @@ -458,15 +462,15 @@ def test_retain_grad_hidden_states_attentions(self): # we take the second output since last_hidden_state is the second item output = outputs[1] - encoder_hidden_states = 
outputs.encoder_hidden_states[0] - encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states = outputs.encoder_hidden_states_vision[0] + encoder_attentions = outputs.encoder_attentions[0][0] encoder_hidden_states.retain_grad() encoder_attentions.retain_grad() - decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions = outputs.decoder_attentions[0][0] decoder_attentions.retain_grad() - cross_attentions = outputs.cross_attentions[0] + cross_attentions = outputs.decoder_attentions[-1][0] cross_attentions.retain_grad() output.flatten()[0].backward(retain_graph=True) @@ -510,7 +514,7 @@ def test_different_timm_backbone(self): expected_shape = ( self.model_tester.batch_size, self.model_tester.num_queries, - self.model_tester.num_labels, + config.max_text_len, ) self.assertEqual(outputs.logits.shape, expected_shape) From 033d9039bbaa15f641b90987d6c752a36bb01f9e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 18 Oct 2023 17:48:59 -0300 Subject: [PATCH 093/252] Improved tests and modeling --- .../configuration_grounding_dino.py | 15 +++-- .../convert_grounding_dino_to_hf.py | 2 +- .../test_modeling_grounding_dino.py | 66 ++++++------------- 3 files changed, 32 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 6b1f6c1913e7e2..869028e3cc2514 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -370,11 +370,16 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = ( - GroundingDINOTextPrenetConfig() - if text_backbone_config is None - else GroundingDINOTextPrenetConfig(**text_backbone_config) - ) + if text_backbone_config is None: + self.text_backbone_config = GroundingDINOTextPrenetConfig() + elif isinstance(text_backbone_config, dict): + self.text_backbone_config = GroundingDINOTextPrenetConfig(**text_backbone_config) + elif isinstance(text_backbone_config, GroundingDINOTextPrenetConfig): + self.text_backbone_config = text_backbone_config + else: + raise ValueError( + f"`text_backbone_config` should be either a `dict` or a `GroundingDINOTextPrenetConfig` instance instead got {type(text_backbone_config)}" + ) self.max_text_len = max_text_len # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 2ddfcf34b80615..3d362d8e92dd62 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -391,7 +391,7 @@ def convert_grounding_dino_checkpoint(args): [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ) assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4) + assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-3) print("Looks ok!") if pytorch_dump_folder_path is not None: diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 8592f9036dac10..095d768b886ff0 100644 --- 
a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -20,7 +20,7 @@ import unittest from typing import Dict, List, Tuple -from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available +from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available, GroundingDINOTextPrenetConfig from transformers.file_utils import cached_property from transformers.testing_utils import ( require_timm, @@ -58,14 +58,14 @@ def __init__( use_labels=True, hidden_size=32, num_hidden_layers=2, - num_attention_heads=8, - intermediate_size=8, + num_attention_heads=4, + intermediate_size=4, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, num_queries=12, num_channels=3, - image_size=196, + image_size=64, n_targets=8, num_labels=91, num_feature_levels=4, @@ -128,13 +128,20 @@ def prepare_config_and_inputs(self): def get_config(self): swin_config = SwinConfig( window_size=7, - embed_dim=96, - depths=[2, 2, 18, 2], - num_heads=[3, 6, 12, 24], + embed_dim=16, + depths=[1, 1, 1, 1], + num_heads=[1, 1, 1, 1], image_size=self.image_size, out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) + text_backbone = GroundingDINOTextPrenetConfig( + hidden_size=8, + num_hidden_layers=2, + num_attention_heads=2, + intermediate_size=8, + max_position_embeddings=8 + ) return GroundingDINOConfig( d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, @@ -153,6 +160,7 @@ def get_config(self): use_timm_backbone=False, backbone_config=swin_config, max_text_len=self.max_text_len, + text_backbone_config=text_backbone ) def prepare_config_and_inputs_for_common(self): @@ -592,11 +600,9 @@ def test_inference_object_detection_head(self): expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]).to(torch_device) expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ).to(torch_device) - expected_boxes = torch.tensor( - [[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]] + [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) @@ -607,47 +613,17 @@ def test_inference_object_detection_head(self): # verify postprocessing results = image_processor.post_process_object_detection( - outputs, threshold=0.3, target_sizes=[image.size[::-1]] + outputs, threshold=0.35, target_sizes=[image.size[::-1]] )[0] - expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]).to(torch_device) + expected_scores = torch.tensor([0.4526, 0.4082]).to(torch_device) expected_labels = [17, 17, 75, 75, 63] - expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841]).to(torch_device) + expected_slice_boxes = torch.tensor([491.1074, 198.5045, 292.5861, 350.6499]).to(torch_device) - self.assertEqual(len(results["scores"]), 5) + self.assertEqual(len(results["scores"]), 2) self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4)) self.assertSequenceEqual(results["labels"].tolist(), expected_labels) self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) - def 
test_inference_object_detection_head_with_box_refine_two_stage(self): - model = GroundingDINOForObjectDetection.from_pretrained( - "SenseTime/deformable-detr-with-box-refine-two-stage" - ).to(torch_device) - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="pt").to(torch_device) - pixel_values = encoding["pixel_values"].to(torch_device) - pixel_mask = encoding["pixel_mask"].to(torch_device) - - with torch.no_grad(): - outputs = model(pixel_values, pixel_mask) - - expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) - self.assertEqual(outputs.logits.shape, expected_shape_logits) - - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ).to(torch_device) - expected_boxes = torch.tensor( - [[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]] - ).to(torch_device) - - self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) - - expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) - @require_torch_gpu def test_inference_object_detection_head_equivalence_cpu_gpu(self): image_processor = self.default_image_processor From baed29afc84954346d4f61e6dd9de56cecc9eba8 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 21 Oct 2023 01:03:07 -0300 Subject: [PATCH 094/252] Improved tests and added image processor --- src/transformers/__init__.py | 2 + .../models/auto/image_processing_auto.py | 2 +- .../models/grounding_dino/__init__.py | 2 + .../convert_grounding_dino_to_hf.py | 3 +- .../image_processing_grounding_dino.py | 1401 +++++++++++++++++ .../processing_grounding_dino.py | 2 +- .../test_modeling_grounding_dino.py | 117 +- 7 files changed, 1499 insertions(+), 30 deletions(-) create mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c73345163a37b9..265b95c17ac5c6 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -979,6 +979,7 @@ _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) + _import_structure["models.grounding_dino"].extend(["GroundingDINOImageProcessor"]) _import_structure["models.idefics"].extend(["IdeficsImageProcessor"]) _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"]) _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"]) @@ -5074,6 +5075,7 @@ from .models.efficientnet import EfficientNetImageProcessor from .models.flava import FlavaFeatureExtractor, FlavaImageProcessor, FlavaProcessor from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor + from .models.grounding_dino import GroundingDINOImageProcessor from .models.idefics import IdeficsImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor diff --git a/src/transformers/models/auto/image_processing_auto.py 
b/src/transformers/models/auto/image_processing_auto.py index cf33369ef5492d..6399fe192616af 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -66,7 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), - ("grounding-dino", "DeformableDetrImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 229666382564b8..8002244b4287cd 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -24,6 +24,7 @@ "GroundingDINOTextPrenetConfig", ], "processing_grounding_dino": ["GroundingDINOProcessor"], + "image_processing_grounding_dino": ["GroundingDINOImageProcessor"] } try: @@ -47,6 +48,7 @@ GroundingDINOTextPrenetConfig, ) from .processing_grounding_dino import GroundingDINOProcessor + from .image_processing_grounding_dino import GroundingDINOImageProcessor try: if not is_torch_available(): diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 3d362d8e92dd62..ce48e78e219e8a 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -29,6 +29,7 @@ GroundingDINOConfig, GroundingDINOForObjectDetection, GroundingDINOProcessor, + GroundingDINOImageProcessor ) @@ -369,7 +370,7 @@ def convert_grounding_dino_checkpoint(args): transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) original_pixel_values = transforms(image).unsqueeze(0) - image_processor = DeformableDetrImageProcessor() + image_processor = GroundingDINOImageProcessor() tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py new file mode 100644 index 00000000000000..44c7a8dabc3f1b --- /dev/null +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -0,0 +1,1401 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
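A small construction sketch mirroring how the conversion script above wires the new image processor into the processor; the tokenizer checkpoint is the one used there, the rest is illustrative:

from transformers import AutoTokenizer, GroundingDINOImageProcessor, GroundingDINOProcessor

image_processor = GroundingDINOImageProcessor()                 # size defaults to shortest_edge=800 / longest_edge=1333
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # text backbone tokenizer used in the conversion script
processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer)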
+"""Image processor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import BaseImageProcessor, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_coco_detection_annotations, + valid_coco_panoptic_annotations, + valid_images, +) +from ...utils import ( + ExplicitEnum, + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + +if is_scipy_available(): + import scipy.special + import scipy.stats + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotionFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC) + + +# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (height <= width and height == size) or (width <= height and width == size): + return height, width + + if width < height: + ow = size + oh = int(size * height / width) + else: + oh = size + ow = int(size * width / height) + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. 
+ input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> List[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. 
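To make the resizing rule concrete, a hand-worked example derived from `get_size_with_aspect_ratio` as written above (not part of the patch):

# For a (height=480, width=640) image with size=800, max_size=1333:
#   640 / 480 * 800 ≈ 1067 <= 1333, so the shorter-edge target stays 800
#   -> (oh, ow) = (800, int(800 * 640 / 480)) = (800, 1066)
# With size=1200, max_size=1333 the cap applies:
#   640 / 480 * 1200 = 1600 > 1333, so size becomes int(round(1333 * 480 / 640)) = 1000
#   -> (oh, ow) = (1000, 1333)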
+ """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask +def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`List[List[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->DeformableDetr +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by DeformableDetr. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. 
+ annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints[keep] + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width) + new_target["masks"] = masks[keep] + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes +def masks_to_boxes(masks: np.ndarray) -> np.ndarray: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DeformableDetr +def prepare_coco_panoptic_annotation( + image: np.ndarray, + target: Dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> Dict: + """ + Prepare a coco panoptic annotation for DeformableDetr. 
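A hand-worked illustration of the box handling in `prepare_coco_detection_annotation` above; the target dict is a made-up COCO-style example, not taken from the patch:

# target = {"image_id": 42,
#           "annotations": [{"bbox": [10.0, 20.0, 30.0, 40.0], "category_id": 17, "area": 1200.0, "iscrowd": 0}]}
# "bbox" is COCO [x, y, w, h]; `boxes[:, 2:] += boxes[:, :2]` converts it to corner format:
#   [10, 20, 30, 40] -> [10, 20, 40, 60], which is then clipped to the image width/height
#   and kept only if x_max > x_min and y_max > y_min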
+ """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64) + new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64) + new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64) + + if "segments_info" in target: + masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([segment_info["id"] for segment_info in target["segments_info"]]) + masks = masks == ids[:, None, None] + masks = masks.astype(np.uint8) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = np.array( + [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["iscrowd"] = np.asarray( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["area"] = np.asarray( + [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32 + ) + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.get_segmentation_image +def get_segmentation_image( + masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False +): + h, w = input_size + final_h, final_w = target_size + + m_id = scipy.special.softmax(masks.transpose(0, 1), -1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = np.zeros((h, w), dtype=np.int64) + else: + m_id = m_id.argmax(-1).reshape(h, w) + + if deduplicate: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + for eq_id in equiv: + m_id[m_id == eq_id] = equiv[0] + + seg_img = id_to_rgb(m_id) + seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST) + return seg_img + + +# Copied from transformers.models.detr.image_processing_detr.get_mask_area +def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray: + final_h, final_w = target_size + np_seg_img = seg_img.astype(np.uint8) + np_seg_img = np_seg_img.reshape(final_h, final_w, 3) + m_id = rgb_to_id(np_seg_img) + area = [(m_id == i).sum() for i in range(n_classes)] + return area + + +# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities +def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + probs = scipy.special.softmax(logits, axis=-1) + labels = probs.argmax(-1, keepdims=True) + scores = np.take_along_axis(probs, labels, axis=-1) + scores, labels = scores.squeeze(-1), labels.squeeze(-1) + return scores, labels + + +# Copied from transformers.models.detr.image_processing_detr.post_process_panoptic_sample +def post_process_panoptic_sample( + out_logits: np.ndarray, + masks: np.ndarray, + boxes: np.ndarray, + processed_size: Tuple[int, int], + target_size: Tuple[int, int], + is_thing_map: Dict, + threshold=0.85, +) -> Dict: + """ + Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample. + + Args: + out_logits (`torch.Tensor`): + The logits for this sample. + masks (`torch.Tensor`): + The predicted segmentation masks for this sample. + boxes (`torch.Tensor`): + The prediced bounding boxes for this sample. 
The boxes are in the normalized format `(center_x, center_y, + width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding). + processed_size (`Tuple[int, int]`): + The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size + after data augmentation but before batching. + target_size (`Tuple[int, int]`): + The target size of the image, `(height, width)` corresponding to the requested final size of the + prediction. + is_thing_map (`Dict`): + A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not. + threshold (`float`, *optional*, defaults to 0.85): + The threshold used to binarize the segmentation masks. + """ + # we filter empty queries and detection below threshold + scores, labels = score_labels_from_class_probabilities(out_logits) + keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold) + + cur_scores = scores[keep] + cur_classes = labels[keep] + cur_boxes = center_to_corners_format(boxes[keep]) + + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + cur_masks = masks[keep] + cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR) + cur_masks = safe_squeeze(cur_masks, 1) + b, h, w = cur_masks.shape + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.reshape(b, -1) + stuff_equiv_classes = defaultdict(list) + for k, label in enumerate(cur_classes): + if not is_thing_map[label]: + stuff_equiv_classes[label].append(k) + + seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores)) + + # We filter out any mask that is too small + if cur_classes.size() > 0: + # We know filter empty masks as long as we find some + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + while filtered_small.any(): + cur_masks = cur_masks[~filtered_small] + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores)) + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + else: + cur_classes = np.ones((1, 1), dtype=np.int64) + + segments_info = [ + {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a} + for i, (cat, a) in enumerate(zip(cur_classes, area)) + ] + del cur_classes + + with io.BytesIO() as out: + PIL.Image.fromarray(seg_img).save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + + return predictions + + +# Copied from transformers.models.detr.image_processing_detr.resize_annotation +def resize_annotation( + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. 
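For the scaling step in `resize_annotation`, a small numeric sketch (values chosen for illustration only):

# orig_size=(480, 640), target_size=(800, 1066) -> (ratio_height, ratio_width) ≈ (1.667, 1.666)
# a corner-format box [10, 20, 40, 60] is scaled as
#   [10 * ratio_width, 20 * ratio_height, 40 * ratio_width, 60 * ratio_height] ≈ [16.7, 33.3, 66.6, 100.0]
# and "area" is scaled by ratio_width * ratio_height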
+ threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. + """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle +def binary_mask_to_rle(mask): + """ + Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + mask (`torch.Tensor` or `numpy.array`): + A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target + segment_id or class_id. + Returns: + `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE + format. + """ + if is_torch_tensor(mask): + mask = mask.numpy() + + pixels = mask.flatten() + pixels = np.concatenate([[0], pixels, [0]]) + runs = np.where(pixels[1:] != pixels[:-1])[0] + 1 + runs[1::2] -= runs[::2] + return list(runs) + + +# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle +def convert_segmentation_to_rle(segmentation): + """ + Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + segmentation (`torch.Tensor` or `numpy.array`): + A segmentation map of shape `(height, width)` where each value denotes a segment or class id. + Returns: + `List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id. + """ + segment_ids = torch.unique(segmentation) + + run_length_encodings = [] + for idx in segment_ids: + mask = torch.where(segmentation == idx, 1, 0) + rle = binary_mask_to_rle(mask) + run_length_encodings.append(rle) + + return run_length_encodings + + +# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects +def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels): + """ + Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and + `labels`. + + Args: + masks (`torch.Tensor`): + A tensor of shape `(num_queries, height, width)`. + scores (`torch.Tensor`): + A tensor of shape `(num_queries)`. + labels (`torch.Tensor`): + A tensor of shape `(num_queries)`. + object_mask_threshold (`float`): + A number between 0 and 1 used to binarize the masks. + Raises: + `ValueError`: Raised when the first dimension doesn't match in all input tensors. + Returns: + `Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region + < `object_mask_threshold`. 
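A quick numeric illustration of the rule implemented just below (all values are made up): a query survives only if its score clears `object_mask_threshold` and its label differs from `num_labels`, the "no object" class.

    import torch

    scores = torch.tensor([0.9, 0.2, 0.7])
    labels = torch.tensor([1, 3, 4])            # suppose num_labels == 4, i.e. label 4 means "no object"
    masks = torch.zeros(3, 8, 8)

    to_keep = labels.ne(4) & (scores > 0.5)     # -> tensor([ True, False, False])
    masks, scores, labels = masks[to_keep], scores[to_keep], labels[to_keep]  # only the first query is kept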
+ """ + if not (masks.shape[0] == scores.shape[0] == labels.shape[0]): + raise ValueError("mask, scores and labels must have the same shape!") + + to_keep = labels.ne(num_labels) & (scores > object_mask_threshold) + + return masks[to_keep], scores[to_keep], labels[to_keep] + + +# Copied from transformers.models.detr.image_processing_detr.check_segment_validity +def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8): + # Get the mask associated with the k class + mask_k = mask_labels == k + mask_k_area = mask_k.sum() + + # Compute the area of all the stuff in query k + original_area = (mask_probs[k] >= mask_threshold).sum() + mask_exists = mask_k_area > 0 and original_area > 0 + + # Eliminate disconnected tiny segments + if mask_exists: + area_ratio = mask_k_area / original_area + if not area_ratio.item() > overlap_mask_area_threshold: + mask_exists = False + + return mask_exists, mask_k + + +# Copied from transformers.models.detr.image_processing_detr.compute_segments +def compute_segments( + mask_probs, + pred_scores, + pred_labels, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[Set[int]] = None, + target_size: Tuple[int, int] = None, +): + height = mask_probs.shape[1] if target_size is None else target_size[0] + width = mask_probs.shape[2] if target_size is None else target_size[1] + + segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device) + segments: List[Dict] = [] + + if target_size is not None: + mask_probs = nn.functional.interpolate( + mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + )[0] + + current_segment_id = 0 + + # Weigh each mask by its prediction score + mask_probs *= pred_scores.view(-1, 1, 1) + mask_labels = mask_probs.argmax(0) # [height, width] + + # Keep track of instances of each class + stuff_memory_list: Dict[str, int] = {} + for k in range(pred_labels.shape[0]): + pred_class = pred_labels[k].item() + should_fuse = pred_class in label_ids_to_fuse + + # Check if mask exists and large enough to be a segment + mask_exists, mask_k = check_segment_validity( + mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold + ) + + if mask_exists: + if pred_class in stuff_memory_list: + current_segment_id = stuff_memory_list[pred_class] + else: + current_segment_id += 1 + + # Add current object segment to final segmentation map + segmentation[mask_k] = current_segment_id + segment_score = round(pred_scores[k].item(), 6) + segments.append( + { + "id": current_segment_id, + "label_id": pred_class, + "was_fused": should_fuse, + "score": segment_score, + } + ) + if should_fuse: + stuff_memory_list[pred_class] = current_segment_id + + return segmentation, segments + + +class GroundingDINOImageProcessor(BaseImageProcessor): + r""" + Constructs a Grounding DINO image processor. + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in + the `preprocess` method. 
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be + overridden by the `do_pad` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ + def __init__( + self, + format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_pad: bool = True, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + + @classmethod + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->DeformableDetr + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. 
`DeformableDetrImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DeformableDetr + def prepare_annotation( + self, + image: np.ndarray, + target: Dict, + format: Optional[AnnotionFormat] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into DeformableDetr model. + """ + format = format if format is not None else self.format + + if format == AnnotionFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotionFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare + def prepare(self, image, target, return_segmentation_masks=None, masks_path=None): + logger.warning_once( + "The `prepare` method is deprecated and will be removed in a v4.33. " + "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " + "does not return the image anymore.", + ) + target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format) + return image, target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask + def convert_coco_poly_to_mask(self, *args, **kwargs): + logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ") + return convert_coco_poly_to_mask(*args, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection + def prepare_coco_detection(self, *args, **kwargs): + logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ") + return prepare_coco_detection_annotation(*args, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic + def prepare_coco_panoptic(self, *args, **kwargs): + logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. 
") + return prepare_coco_panoptic_annotation(*args, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or + `height` and `width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> Dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format. + """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: Tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + return padded_image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: List[np.ndarray], + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + image (`np.ndarray`): + Image to pad. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
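A rough sketch of what this padding produces, assuming `image_processor` stands for an instantiated `GroundingDINOImageProcessor` and the shapes are purely illustrative:

    import numpy as np

    images = [
        np.zeros((3, 480, 640), dtype=np.float32),
        np.zeros((3, 512, 512), dtype=np.float32),
    ]
    batch = image_processor.pad(images, return_tensors="np")
    # pixel_values are padded on the bottom/right to the per-batch maximum, here shape (2, 3, 512, 640)
    # pixel_mask[i] holds 1 over the original image area and 0 over the padded region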
+ """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + padded_images = [ + self._pad_image( + image, + pad_size, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotionFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image after resizing. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. 
+ do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. + format (`str` or `AnnotionFormat`, *optional*, defaults to self.format): + Format of the annotations. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + max_size = None + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." 
+ ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, max_size=max_size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_pad = self.do_pad if do_pad is None else do_pad + format = self.format if format is None else format + + if do_resize is not None and size is None: + raise ValueError("Size and max_size must be specified if do_resize is True.") + + if do_rescale is not None and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize is not None and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + images = make_list_of_images(images) + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + format = AnnotionFormat(format) + if annotations is not None: + if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations): + raise ValueError( + "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts " + "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " + "being a list of annotations in the COCO format." + ) + elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations): + raise ValueError( + "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts " + "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with " + "the latter being a list of annotations in the COCO format." + ) + elif format not in SUPPORTED_ANNOTATION_FORMATS: + raise ValueError( + f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}" + ) + + if ( + masks_path is not None + and format == AnnotionFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
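        # As an illustration of that inference: an array shaped (3, 480, 640) is typically read as channels-first,
        # while (480, 640, 3) is read as channels-last; when the guess could be ambiguous, callers can pass
        # `input_data_format` explicitly rather than rely on the inference below.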
+ input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + if annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + data = self.pad( + images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + data = {"pixel_values": images} + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDINO + def post_process_object_detection( + self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None + ): + """ + Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`OwlViTObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. 
+ Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + # TODO: (amy) add support for other frameworks + logits, boxes = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + probs = torch.max(logits, dim=-1) + scores = torch.sigmoid(probs.values) + labels = probs.indices + + # Convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 10fd6e9834a9c3..131eb2c600a1cd 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -40,7 +40,7 @@ class GroundingDINOProcessor(ProcessorMixin): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "DeformableDetrImageProcessor" + image_processor_class = "GroundingDINOImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer): diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 095d768b886ff0..d06637a3a36ad5 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -46,7 +46,7 @@ if is_vision_available(): from PIL import Image - from transformers import AutoImageProcessor + from transformers import AutoImageProcessor, AutoProcessor class GroundingDINOModelTester: @@ -95,12 +95,15 @@ def __init__( self.max_text_len = max_text_len # we also set the expected seq length for both encoder and decoder - self.encoder_seq_length = ( + self.encoder_seq_length_vision = ( math.ceil(self.image_size / 8) ** 2 + math.ceil(self.image_size / 16) ** 2 + math.ceil(self.image_size / 32) ** 2 + math.ceil(self.image_size / 64) ** 2 ) + + self.encoder_seq_length_text = self.max_text_len + self.decoder_seq_length = self.num_queries def prepare_config_and_inputs(self): @@ -451,6 +454,66 @@ def recursive_check(tuple_object, dict_object): model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} ) + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states_vision + + expected_num_layers = getattr( + self.model_tester, 
"expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_len = self.model_tester.encoder_seq_length_vision + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + hidden_states = outputs.encoder_hidden_states_text + + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_len = self.model_tester.encoder_seq_length_text + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_retain_grad_hidden_states_attentions(self): # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad @@ -576,28 +639,31 @@ def prepare_img(): image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") return image +def prepare_text(): + text = "a cat." 
+ return text + @require_timm @require_vision @slow class GroundingDINOModelIntegrationTests(unittest.TestCase): @cached_property - def default_image_processor(self): - return AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") if is_vision_available() else None + def default_processor(self): + return AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") if is_vision_available() else None def test_inference_object_detection_head(self): - model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr").to(torch_device) + model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").to(torch_device) - image_processor = self.default_image_processor + processor = self.default_processor image = prepare_img() - encoding = image_processor(images=image, return_tensors="pt").to(torch_device) - pixel_values = encoding["pixel_values"].to(torch_device) - pixel_mask = encoding["pixel_mask"].to(torch_device) + text = prepare_text() + encoding = processor(images=image, text=text, return_tensors="pt").to(torch_device) with torch.no_grad(): - outputs = model(pixel_values, pixel_mask) + outputs = model(**encoding) - expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.d_model)) self.assertEqual(outputs.logits.shape, expected_shape_logits) expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]).to(torch_device) @@ -605,50 +671,47 @@ def test_inference_object_detection_head(self): [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ).to(torch_device) - self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-3)) expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) # verify postprocessing - results = image_processor.post_process_object_detection( + results = processor.image_processor.post_process_object_detection( outputs, threshold=0.35, target_sizes=[image.size[::-1]] )[0] expected_scores = torch.tensor([0.4526, 0.4082]).to(torch_device) - expected_labels = [17, 17, 75, 75, 63] - expected_slice_boxes = torch.tensor([491.1074, 198.5045, 292.5861, 350.6499]).to(torch_device) + expected_slice_boxes = torch.tensor([344.8143, 23.1796, 637.4004, 373.8295]).to(torch_device) self.assertEqual(len(results["scores"]), 2) - self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4)) - self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3)) self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) @require_torch_gpu def test_inference_object_detection_head_equivalence_cpu_gpu(self): - image_processor = self.default_image_processor + processor = self.default_processor image = prepare_img() - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - pixel_mask = encoding["pixel_mask"] + text = prepare_text() + encoding = processor(images=image, text=text, return_tensors="pt") # 1. 
run model on CPU - model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr-single-scale") + model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") with torch.no_grad(): - cpu_outputs = model(pixel_values, pixel_mask) + cpu_outputs = model(**encoding) # 2. run model on GPU model.to("cuda") - + encoding = {key: value.to("cuda") for key, value in encoding.items()} with torch.no_grad(): - gpu_outputs = model(pixel_values.to("cuda"), pixel_mask.to("cuda")) + gpu_outputs = model(**encoding) # 3. assert equivalence for key in cpu_outputs.keys(): assert torch.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-4) expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ) - assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-4) + assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-3) From 50c5f67da35945ae11ebc276fe0862de839b88f9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 21 Oct 2023 23:47:59 -0300 Subject: [PATCH 095/252] Improved tests inference --- tests/models/grounding_dino/test_modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index d06637a3a36ad5..4bea3e4f4bc817 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -686,7 +686,7 @@ def test_inference_object_detection_head(self): self.assertEqual(len(results["scores"]), 2) self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3)) - self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2)) @require_torch_gpu def test_inference_object_detection_head_equivalence_cpu_gpu(self): From d2922e1a25e687aec4a249f3ae8a52d77a31efdf Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 23 Oct 2023 00:25:34 -0300 Subject: [PATCH 096/252] More improvements --- .../models/grounding_dino/modeling_grounding_dino.py | 11 ++++++++--- .../grounding_dino/test_modeling_grounding_dino.py | 4 ---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9e657f168d3638..f918eebd1457d4 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1673,6 +1673,8 @@ def forward( enc_outputs = [ vision_features, text_features, + encoder_vision_states, + encoder_text_states, all_attns ] return tuple(v for v in enc_outputs if v is not None) @@ -2501,8 +2503,11 @@ def forward( return_dict=return_dict, ) + # index for encoder_last_hidden_state_text + idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0) + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] - enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[7] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx] init_reference = outputs.init_reference_points if return_dict else outputs[0] 
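        # Working through the arithmetic above, assuming the tuple layout used when return_dict=False:
        # the text encoder's last hidden state is expected at tuple index 5 when neither output_attentions nor
        # output_hidden_states is set, at index 6 when exactly one of them is set, and at index 7 when both are.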
inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] @@ -2561,8 +2566,8 @@ def forward( auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) outputs_loss["auxiliary_outputs"] = auxiliary_outputs if self.config.two_stage: - enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid() - outputs_loss["enc_outputs"] = {"logits": outputs.enc_outputs_class, "pred_boxes": enc_outputs_coord} + enc_outputs_coord = outputs[-1].sigmoid() + outputs_loss["enc_outputs"] = {"logits": outputs[-2], "pred_boxes": enc_outputs_coord} loss_dict = criterion(outputs_loss, labels) # Fourth: compute total loss, as a weighted sum of the various losses diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 4bea3e4f4bc817..69b9dbef5c05b4 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -538,9 +538,6 @@ def test_retain_grad_hidden_states_attentions(self): encoder_hidden_states.retain_grad() encoder_attentions.retain_grad() - decoder_attentions = outputs.decoder_attentions[0][0] - decoder_attentions.retain_grad() - cross_attentions = outputs.decoder_attentions[-1][0] cross_attentions.retain_grad() @@ -548,7 +545,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(encoder_hidden_states.grad) self.assertIsNotNone(encoder_attentions.grad) - self.assertIsNotNone(decoder_attentions.grad) self.assertIsNotNone(cross_attentions.grad) def test_forward_signature(self): From 891c34dd362f77d7162611cfa8a66a7d00199535 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 26 Oct 2023 00:08:07 -0300 Subject: [PATCH 097/252] More test improvements --- .../configuration_grounding_dino.py | 8 ++++++ .../grounding_dino/modeling_grounding_dino.py | 27 ++++++++++++------- .../test_modeling_grounding_dino.py | 2 ++ 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 869028e3cc2514..5d74a970cfa2e2 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -75,6 +75,8 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. Examples: @@ -108,6 +110,7 @@ def __init__( pad_token_id=0, position_embedding_type="absolute", use_cache=True, + init_std=0.02, **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -125,6 +128,7 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache + self.init_std = init_std @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -253,6 +257,8 @@ class GroundingDINOConfig(PretrainedConfig): generation. positional_embedding_temperature (`float`, *optional*, defaults to 20): The temperature for Sine Positional Embedding that is used together with vision backbone. 
+ init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. Examples: ```python @@ -319,6 +325,7 @@ def __init__( decoder_bbox_embed_share=True, two_stage_bbox_embed_share=False, positional_embedding_temperature=20, + init_std=0.02, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -394,6 +401,7 @@ def __init__( if two_stage_bbox_embed_share and not decoder_bbox_embed_share: raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") self.positional_embedding_temperature = positional_embedding_temperature + self.init_std = init_std super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index f918eebd1457d4..6b23c42eaf644b 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1427,6 +1427,8 @@ class GroundingDINOPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, GroundingDINOLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) @@ -1437,21 +1439,26 @@ def _init_weights(self, module): elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): for p in module.parameters(): if p.dim() > 1: - nn.init.xavier_uniform_(p) - elif isinstance(module, GroundingDINOModel): - nn.init.constant_(module.text_projection.bias.data, 0) - nn.init.xavier_uniform_(module.text_projection.weight.data) - for proj in module.input_proj_vision: - nn.init.xavier_uniform_(proj[0].weight, gain=1) - nn.init.constant_(proj[0].bias, 0) + nn.init.normal_(p, mean=0.0, std=std) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, GroundingDINOMLPPredictionHead): + nn.init.constant_(module.layers[-1].weight.data, 0) + nn.init.constant_(module.layers[-1].bias.data, 0) + if hasattr(module, "reference_points") and not self.config.two_stage: nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) nn.init.constant_(module.reference_points.bias.data, 0.0) if hasattr(module, "level_embed"): nn.init.normal_(module.level_embed) - if isinstance(module, GroundingDINOMLPPredictionHead): - nn.init.constant_(module.layers[-1].weight.data, 0) - nn.init.constant_(module.layers[-1].bias.data, 0) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, GroundingDINODecoder): diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 69b9dbef5c05b4..54faebb8227265 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -600,6 +600,8 @@ def test_initialization(self): if ( "level_embed" in 
name or "sampling_offsets.bias" in name + or "text_param" in name + or "vision_param" in name or "value_proj" in name or "output_proj" in name or "reference_points" in name From eccaec95d0df167c098bdfe8e9c2ae86aa6c7637 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 26 Oct 2023 17:29:51 -0300 Subject: [PATCH 098/252] Fixed last test --- .../test_modeling_grounding_dino.py | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 54faebb8227265..0f12a2545c6879 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -14,9 +14,10 @@ # limitations under the License. """ Testing suite for the PyTorch Grounding DINO model. """ - +import collections import inspect import math +import re import unittest from typing import Dict, List, Tuple @@ -41,6 +42,7 @@ import torch from transformers import GroundingDINOForObjectDetection, GroundingDINOModel + from transformers.pytorch_utils import id_tensor_storage if is_vision_available(): @@ -628,6 +630,41 @@ def test_two_stage_training(self): loss = model(**inputs).loss loss.backward() + def test_tied_weights_keys(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + config.tie_word_embeddings = True + for model_class in self.all_model_classes: + model_tied = model_class(config) + + ptrs = collections.defaultdict(list) + for name, tensor in model_tied.state_dict().items(): + ptrs[id_tensor_storage(tensor)].append(name) + + # These are all the pointers of shared tensors. + tied_params = [names for _, names in ptrs.items() if len(names) > 1] + + tied_weight_keys = model_tied._tied_weights_keys if model_tied._tied_weights_keys is not None else [] + # Detect we get a hit for each key + for key in tied_weight_keys: + if not any(re.search(key, p) for group in tied_params for p in group): + raise ValueError(f"{key} is not a tied weight key for {model_class}.") + + # Removed tied weights found from tied params -> there should only be one left after + for key in tied_weight_keys: + for i in range(len(tied_params)): + tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None] + + # GroundingDINO when sharing weights also uses the shared ones in GroundingDINODecoder + # Therefore, differently from DeformableDetr, we expect the group lens to be 2 + # one for self.bbox_embed in GroundingDINOForObejectDetection and another one + # in the decoder + tied_params = [group for group in tied_params if len(group) > 2] + self.assertListEqual( + tied_params, + [], + f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.", + ) + TOLERANCE = 1e-4 From f32be01ec8795d84aeabac923131acae71f8c652 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 26 Oct 2023 19:13:49 -0300 Subject: [PATCH 099/252] Improved docstrings and comments --- .../grounding_dino/modeling_grounding_dino.py | 126 ++++++++---------- 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 6b23c42eaf644b..b299712746df18 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -339,10 +339,6 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): 
encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - # encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - # encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None - # encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - # encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @@ -770,7 +766,6 @@ def forward( return output, attention_weights -# TODO is this an approriate way to name this? class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" @@ -1296,27 +1291,6 @@ def forward( self_attn_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): - """ - Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(seq_len, batch, embed_dim)`. - position_embeddings (`torch.FloatTensor`, *optional*): - Position embeddings that are added to the queries and keys in the self-attention layer. - reference_points (`torch.FloatTensor`, *optional*): - Reference points. - spatial_shapes (`torch.LongTensor`, *optional*): - Spatial shapes. - level_start_index (`torch.LongTensor`, *optional*): - Level start index. - encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size - `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative - values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ residual = hidden_states # Self Attention @@ -1486,7 +1460,7 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`] + Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDINOImageProcessor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): @@ -1497,18 +1471,31 @@ def _set_gradient_checkpointing(self, module, value=False): [What are attention masks?](../glossary#attention-mask) - decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): - Not used by default. Can be used to mask object queries. + input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDINOTokenizer.__call__`] for details. + + attention_mask (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are real (i.e. **not masked**), + - 0 for tokens that are padding (i.e. **masked**). 
+ + [What are attention masks?](../glossary#attention-mask) + + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token + + [What are token type IDs?](../glossary#token-type-ids) + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) - `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + Tuple consists of (`last_hidden_state_vision`, *optional*: `last_hidden_state_text`, *optional*: + `hidden_states_vision`, *optional*: `hidden_states_text`, *optional*: `attentions`) + `last_hidden_state_vision` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you - can choose to directly pass a flattened representation of an image. - decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): - Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an - embedded representation. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1594,8 +1581,8 @@ def forward( Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: - - 1 for pixel features that are real (i.e. **not masked**), - - 0 for pixel features that are padding (i.e. **masked**). + - 0 for pixel features that are real (i.e. **not masked**), + - 1 for pixel features that are padding (i.e. **masked**). [What are attention masks?](../glossary#attention-mask) vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Position embeddings that are added to the queries and keys in each self-attention layer. @@ -1609,8 +1596,8 @@ def forward( Flattened text features that are passed to the encoder. text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: - - 1 for text features that are real (i.e. **not masked**), - - 0 for text features that are padding (i.e. **masked**). + - 0 for text features that are real (i.e. **not masked**), + - 1 for text features that are padding (i.e. **masked**). [What are attention masks?](../glossary#attention-mask) text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`): Position embeddings that are added to the queries and keys in each self-attention layer. @@ -1700,7 +1687,7 @@ class GroundingDINODecoder(GroundingDINOPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. 
- Some tweaks for Deformable DETR: + Some tweaks for Grounding DINO: - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. - it also returns a stack of intermediate outputs and reference points from all decoding layers. @@ -1785,14 +1772,18 @@ def forward( Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): The query embeddings that are passed into the decoder. - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - of the decoder. - encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected - in `[0, 1]`: - - 1 for pixels that are real (i.e. **not masked**), - - 0 for pixels that are padding (i.e. **masked**). + vision_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Last hidden state from encoder related to vision feature map. + vision_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + text_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Last hidden state from encoder related to text features. + text_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 0 for text features that are real (i.e. **not masked**), + - 1 for text features that are padding (i.e. **masked**). reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): @@ -1801,7 +1792,10 @@ def forward( Indexes for the start of each feature level. In range `[0, sequence_length]`. valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): Ratio of valid area in each feature level. - + self_attn_mask (`torch.BoolTensor` of shape `(batch_size, text_seq_len)`): + Masks to avoid performing self-attention between vision hidden state. Mask values selected in `[0, 1]`: + - 1 for queries that are real (i.e. **not masked**), + - 0 for queries that are padding (i.e. **masked**). output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
@@ -2045,8 +2039,6 @@ def __init__(self, config: GroundingDINOConfig): self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) - print("Two stage:", config.two_stage) - if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) @@ -2175,23 +2167,23 @@ def forward( Examples: ```python - >>> from transformers import AutoImageProcessor, GroundingDINOModel + >>> from transformers import AutoProcessor, GroundingDINOModel >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." - >>> image_processor = AutoImageProcessor.from_pretrained("idea-research/grounding-dino-tiny") - >>> model = GroundingDINOModel.from_pretrained("idea-research/grounding-dino-tiny") - - >>> inputs = image_processor(images=image, return_tensors="pt") + >>> processor = AutoProcessor.from_pretrained("idea-research/grounding-dino-tiny") + >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") + >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state >>> list(last_hidden_states.shape) - [1, 300, 256] + [1, 900, 256] ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -2464,33 +2456,33 @@ def forward( Examples: ```python - >>> from transformers import AutoImageProcessor, GroundingDINOForObjectDetection + >>> from transformers import AutoProcessor, GroundingDINOForObjectDetection >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." - >>> image_processor = AutoImageProcessor.from_pretrained("idea-research/grounding-dino-tiny") + >>> processor = AutoProcessor.from_pretrained("idea-research/grounding-dino-tiny") >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") - >>> inputs = image_processor(images=image, return_tensors="pt") + >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) >>> # convert outputs (bounding boxes and class logits) to COCO API >>> target_sizes = torch.tensor([image.size[::-1]]) - >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + >>> results = processor.image_processor.post_process_object_detection(outputs, threshold=0.35, target_sizes=target_sizes)[ ... 0 ... ] >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): ... box = [round(i, 2) for i in box.tolist()] ... print( - ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"Detected {label.item()} with confidence " ... f"{round(score.item(), 3)} at location {box}" ... 
) - Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] - Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] - Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83] + Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89] ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict From 1c657e2adca10f5d753d307ab1bfbd0ac4cffdbe Mon Sep 17 00:00:00 2001 From: Niels Date: Fri, 27 Oct 2023 20:55:24 +0200 Subject: [PATCH 100/252] Fix style --- .../models/grounding_dino/__init__.py | 4 +- .../convert_grounding_dino_to_hf.py | 3 +- .../image_processing_grounding_dino.py | 12 +-- .../grounding_dino/modeling_grounding_dino.py | 91 +++++++++---------- .../utils/dummy_vision_objects.py | 7 ++ .../test_modeling_grounding_dino.py | 28 +++--- 6 files changed, 77 insertions(+), 68 deletions(-) diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 8002244b4287cd..83622a84513843 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -23,8 +23,8 @@ "GroundingDINOConfig", "GroundingDINOTextPrenetConfig", ], + "image_processing_grounding_dino": ["GroundingDINOImageProcessor"], "processing_grounding_dino": ["GroundingDINOProcessor"], - "image_processing_grounding_dino": ["GroundingDINOImageProcessor"] } try: @@ -47,8 +47,8 @@ GroundingDINOConfig, GroundingDINOTextPrenetConfig, ) - from .processing_grounding_dino import GroundingDINOProcessor from .image_processing_grounding_dino import GroundingDINOImageProcessor + from .processing_grounding_dino import GroundingDINOProcessor try: if not is_torch_available(): diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index ce48e78e219e8a..8883e64814d33b 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -25,11 +25,10 @@ from transformers import ( AutoTokenizer, - DeformableDetrImageProcessor, GroundingDINOConfig, GroundingDINOForObjectDetection, + GroundingDINOImageProcessor, GroundingDINOProcessor, - GroundingDINOImageProcessor ) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 44c7a8dabc3f1b..6c9d86f5a026b5 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -763,7 +763,7 @@ class GroundingDINOImageProcessor(BaseImageProcessor): Constructs a Grounding DINO image processor. Args: - format (`str`, *optional*, defaults to `"coco_detection"`): + format (`str`, *optional*, defaults to `AnnotionFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". do_resize (`bool`, *optional*, defaults to `True`): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be @@ -771,7 +771,7 @@ class GroundingDINOImageProcessor(BaseImageProcessor): size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): Size of the image's (height, width) dimensions after resizing. 
Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the @@ -779,9 +779,9 @@ class GroundingDINOImageProcessor(BaseImageProcessor): rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. - do_normalize: Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): Mean values to use when normalizing the image. Can be a single value or a list of values, one for each channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. @@ -1349,11 +1349,11 @@ def post_process_object_detection( self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None ): """ - Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, - bottom_right_x, bottom_right_y) format. + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Args: - outputs ([`OwlViTObjectDetectionOutput`]): + outputs ([`GroundingDINOObjectDetectionOutput`]): Raw outputs of the model. threshold (`float`, *optional*): Score threshold to keep object detection predictions. diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index b299712746df18..8634bcecc536bd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -153,9 +153,9 @@ class GroundingDINODecoderOutput(ModelOutput): shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the self-attention, cross-attention and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention, cross-attention and multi-scale deformable attention heads. """ last_hidden_state: torch.FloatTensor = None @@ -186,9 +186,10 @@ class GroundingDINOEncoderOutput(ModelOutput): of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. 
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and + multi-scale deformable attention heads. """ last_hidden_state_vision: torch.FloatTensor = None @@ -217,9 +218,9 @@ class GroundingDINOModelOutput(ModelOutput): shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the self-attention, cross-attention and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention, cross-attention and multi-scale deformable attention heads. encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -233,14 +234,15 @@ class GroundingDINOModelOutput(ModelOutput): of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. - attention softmax, used to compute the weighted average in the bi-attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and + multi-scale deformable attention heads. attention softmax, used to compute the weighted average in the + bi-attention heads. 
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): - Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are - picked as region proposals in the first stage. Output of bounding box binary classification (i.e. - foreground and background). + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as + region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and + background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. """ @@ -290,9 +292,9 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the self-attention, cross-attention and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention, cross-attention and multi-scale deformable attention heads. encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -306,9 +308,10 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and + multi-scale deformable attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). 
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -316,9 +319,9 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Initial reference points sent through the Transformer decoder. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): - Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are - picked as region proposals in the first stage. Output of bounding box binary classification (i.e. - foreground and background). + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as + region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and + background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. """ @@ -1427,7 +1430,7 @@ def _init_weights(self, module): elif isinstance(module, GroundingDINOMLPPredictionHead): nn.init.constant_(module.layers[-1].weight.data, 0) nn.init.constant_(module.layers[-1].bias.data, 0) - + if hasattr(module, "reference_points") and not self.config.two_stage: nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) nn.init.constant_(module.reference_points.bias.data, 0.0) @@ -1460,8 +1463,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDINOImageProcessor.__call__`] - for details. + Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDINOImageProcessor.__call__`] for + details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: @@ -1492,10 +1495,11 @@ def _set_gradient_checkpointing(self, module, value=False): [What are token type IDs?](../glossary#token-type-ids) encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of (`last_hidden_state_vision`, *optional*: `last_hidden_state_text`, *optional*: + Tuple consists of (`last_hidden_state_vision`, *optional*: `last_hidden_state_text`, *optional*: `hidden_states_vision`, *optional*: `hidden_states_text`, *optional*: `attentions`) - `last_hidden_state_vision` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of - hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + `last_hidden_state_vision` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence + of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the + decoder. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
@@ -1664,13 +1668,7 @@ def forward( all_attns = (all_attn_fused_vision, all_attn_fused_text, all_attn_enhanced_text, all_attn_deformable) if not return_dict: - enc_outputs = [ - vision_features, - text_features, - encoder_vision_states, - encoder_text_states, - all_attns - ] + enc_outputs = [vision_features, text_features, encoder_vision_states, encoder_text_states, all_attns] return tuple(v for v in enc_outputs if v is not None) return GroundingDINOEncoderOutput( last_hidden_state_vision=vision_features, @@ -2042,7 +2040,11 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) - if config.two_stage_bbox_embed_share and config.decoder_bbox_embed_share and self.decoder.bbox_embed is not None: + if ( + config.two_stage_bbox_embed_share + and config.decoder_bbox_embed_share + and self.decoder.bbox_embed is not None + ): self.encoder_output_bbox_embed = self.decoder.bbox_embed else: self.encoder_output_bbox_embed = GroundingDINOMLPPredictionHead( @@ -2472,15 +2474,12 @@ def forward( >>> # convert outputs (bounding boxes and class logits) to COCO API >>> target_sizes = torch.tensor([image.size[::-1]]) - >>> results = processor.image_processor.post_process_object_detection(outputs, threshold=0.35, target_sizes=target_sizes)[ - ... 0 - ... ] + >>> results = processor.image_processor.post_process_object_detection( + ... outputs, threshold=0.35, target_sizes=target_sizes + ... )[0] >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): ... box = [round(i, 2) for i in box.tolist()] - ... print( - ... f"Detected {label.item()} with confidence " - ... f"{round(score.item(), 3)} at location {box}" - ... ) + ... 
print(f"Detected {label.item()} with confidence " f"{round(score.item(), 3)} at location {box}") Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83] Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89] ```""" diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index c0c39b57d096bc..27425117909d3a 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -233,6 +233,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class GroundingDINOImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class IdeficsImageProcessor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 0f12a2545c6879..220f1a6231ec9c 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -21,7 +21,13 @@ import unittest from typing import Dict, List, Tuple -from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available, GroundingDINOTextPrenetConfig +from transformers import ( + GroundingDINOConfig, + GroundingDINOTextPrenetConfig, + SwinConfig, + is_torch_available, + is_vision_available, +) from transformers.file_utils import cached_property from transformers.testing_utils import ( require_timm, @@ -48,7 +54,7 @@ if is_vision_available(): from PIL import Image - from transformers import AutoImageProcessor, AutoProcessor + from transformers import AutoProcessor class GroundingDINOModelTester: @@ -141,11 +147,7 @@ def get_config(self): out_indices=[2, 3, 4], ) text_backbone = GroundingDINOTextPrenetConfig( - hidden_size=8, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=8, - max_position_embeddings=8 + hidden_size=8, num_hidden_layers=2, num_attention_heads=2, intermediate_size=8, max_position_embeddings=8 ) return GroundingDINOConfig( d_model=self.hidden_size, @@ -165,7 +167,7 @@ def get_config(self): use_timm_backbone=False, backbone_config=swin_config, max_text_len=self.max_text_len, - text_backbone_config=text_backbone + text_backbone_config=text_backbone, ) def prepare_config_and_inputs_for_common(self): @@ -465,7 +467,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = outputs.encoder_hidden_states_vision + hidden_states = outputs.encoder_hidden_states_vision expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 @@ -515,7 +517,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - def test_retain_grad_hidden_states_attentions(self): # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad @@ -658,7 +659,7 @@ def test_tied_weights_keys(self): # Therefore, differently from DeformableDetr, we expect the group lens to be 2 # one for self.bbox_embed in GroundingDINOForObejectDetection and another one # in the decoder - tied_params = [group for group in tied_params if len(group) > 2] + tied_params = [group for group in tied_params if len(group) > 2] self.assertListEqual( 
tied_params, [], @@ -674,6 +675,7 @@ def prepare_img(): image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") return image + def prepare_text(): text = "a cat." return text @@ -701,7 +703,9 @@ def test_inference_object_detection_head(self): expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.d_model)) self.assertEqual(outputs.logits.shape, expected_shape_logits) - expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]).to(torch_device) + expected_boxes = torch.tensor( + [[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]] + ).to(torch_device) expected_logits = torch.tensor( [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ).to(torch_device) From 1202ce8ee1217e71386539cb4c7a38bcfc08eb06 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 10:59:46 -0300 Subject: [PATCH 101/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8634bcecc536bd..87a13d5aa08a39 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -840,7 +840,7 @@ def __init__(self, config): if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + f"`embed_dim` must be divisible by `num_heads` (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
) self.scale = self.head_dim ** (-0.5) self.dropout = dropout From d62dd114afb55b06e7e727cf82fb09dd10f24561 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:00:06 -0300 Subject: [PATCH 102/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 87a13d5aa08a39..b1c198d0fcb829 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -855,8 +855,8 @@ def __init__(self, config): self._reset_parameters() - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def _reset_parameters(self): nn.init.xavier_uniform_(self.vision_proj.weight) From bbf873b3817850fe85658a2e25b06688aab7ef71 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:00:35 -0300 Subject: [PATCH 103/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index b1c198d0fcb829..1fe4357eb02f0a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1945,7 +1945,7 @@ def custom_forward(*inputs): def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: """Generate attention mask between each pair of special tokens and positional ids. Args: - input_ids (torch.LongTensor): input ids. Shape: [bs, num_token] + input_ids (`torch.LongTensor`): input ids. 
Shape: [batch_size, num_token] Returns: Tuple[torch.Tensor]: attention mask between each special tokens and position_ids """ From c69b8a2da070083c528d311bd301d53b4714b78d Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:00:46 -0300 Subject: [PATCH 104/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 1fe4357eb02f0a..77f558cb6cbc6a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1947,7 +1947,7 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen Args: input_ids (`torch.LongTensor`): input ids. Shape: [batch_size, num_token] Returns: - Tuple[torch.Tensor]: attention mask between each special tokens and position_ids + `Tuple[torch.Tensor]`: attention mask between each special tokens and position_ids """ bs, num_token = input_ids.shape # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens From 274752c2fda16270fca712168e3e4c64d686ea79 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:01:14 -0300 Subject: [PATCH 105/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 77f558cb6cbc6a..8ca460031f6e11 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2103,9 +2103,9 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) """Generate the encoder output proposals from encoded enc_output. Args: - enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. - padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. - spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps. + enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder. + padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`. + spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps. Returns: `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. 
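
A note on the text-mask helper touched in the surrounding patches: `generate_masks_with_special_tokens_and_transfer_map` restricts text self-attention so that each phrase in the prompt, delimited by special tokens such as `[CLS]`, `[SEP]` and `"."`, only attends to itself, and it also returns position ids that restart inside every phrase. The snippet below is a minimal standalone sketch of that masking idea only, not the function's actual implementation; the helper name and the token ids are made up for illustration.

```python
import torch

# Hypothetical ids standing in for the model's SPECIAL_TOKENS ([CLS], [SEP], ".")
SPECIAL_TOKEN_IDS = (101, 102, 1012)


def block_diagonal_text_mask(input_ids: torch.LongTensor) -> torch.BoolTensor:
    """Allow attention only inside each phrase delimited by a special token."""
    batch_size, num_token = input_ids.shape
    mask = torch.eye(num_token, dtype=torch.bool).unsqueeze(0).repeat(batch_size, 1, 1)
    for batch_idx in range(batch_size):
        previous = 0
        for col in range(num_token):
            if input_ids[batch_idx, col].item() in SPECIAL_TOKEN_IDS:
                # the phrase spans from the token after the previous delimiter up to this one
                mask[batch_idx, previous : col + 1, previous : col + 1] = True
                previous = col + 1
    return mask


# "a cat. a dog." -> two phrases, each forming its own attention block
dummy_ids = torch.tensor([[101, 1037, 4937, 1012, 1037, 3899, 1012, 102]])
print(block_diagonal_text_mask(dummy_ids).int())
```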
From 91373e0964f47fcb14968da22a8b1b0d83fb36ac Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 11:06:35 -0300 Subject: [PATCH 106/252] Better naming --- .../models/grounding_dino/modeling_grounding_dino.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8ca460031f6e11..e696b2137bb644 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1949,9 +1949,9 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen Returns: `Tuple[torch.Tensor]`: attention mask between each special tokens and position_ids """ - bs, num_token = input_ids.shape - # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens - special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + batch_size, num_token = input_ids.shape + # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool() for special_token in SPECIAL_TOKENS: special_tokens_mask |= input_ids == special_token @@ -1959,8 +1959,8 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen idxs = torch.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - position_ids = torch.zeros((bs, num_token), device=input_ids.device) + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1) + position_ids = torch.zeros((batch_size, num_token), device=input_ids.device) previous_col = 0 for i in range(idxs.shape[0]): row, col = idxs[i] From 49458838fb3695173586e04932f54a65d5fa7202 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 11:21:57 -0300 Subject: [PATCH 107/252] Better naming --- .../models/grounding_dino/modeling_grounding_dino.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index e696b2137bb644..36ae85b86ac9de 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2116,9 +2116,9 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) """ batch_size = enc_output.shape[0] proposals = [] - _cur = 0 + current_position = 0 for level, (height, width) in enumerate(spatial_shapes): - mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view(batch_size, height, width, 1) valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) @@ -2134,7 +2134,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) proposals.append(proposal) - _cur += height * width + current_position += height * width output_proposals = torch.cat(proposals, 1) output_proposals_valid = 
((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) From 5882f5fcb8cd7ac29c0ffc448824faa041940f9b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 14:58:06 -0300 Subject: [PATCH 108/252] Added Copied statement --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 36ae85b86ac9de..10338735006b50 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -67,7 +67,7 @@ if is_vision_available(): from transformers.image_transforms import center_to_corners_format - +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction class MultiScaleDeformableAttentionFunction(Function): @staticmethod def forward( From c96a1a1fcb6a7e6510274a708f05165a7fcd49c3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 15:06:37 -0300 Subject: [PATCH 109/252] Added Copied statement --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 10338735006b50..5e9d775c9417c6 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -595,7 +595,7 @@ def build_position_encoding(config): return position_embedding - +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention def multi_scale_deformable_attention( value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor ) -> Tensor: From 558ad8776f7a610c925098e3f85a30bf6c44e38f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 15:13:18 -0300 Subject: [PATCH 110/252] Moved param init from GroundingDINOBiMultiHeadAttention --- .../grounding_dino/modeling_grounding_dino.py | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 5e9d775c9417c6..4a5a13d4a74d4b 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -853,25 +853,9 @@ def __init__(self, config): self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) - self._reset_parameters() - def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - def _reset_parameters(self): - nn.init.xavier_uniform_(self.vision_proj.weight) - self.vision_proj.bias.data.fill_(0) - nn.init.xavier_uniform_(self.text_proj.weight) - self.text_proj.bias.data.fill_(0) - nn.init.xavier_uniform_(self.values_vision_proj.weight) - self.values_vision_proj.bias.data.fill_(0) - nn.init.xavier_uniform_(self.values_text_proj.weight) - self.values_text_proj.bias.data.fill_(0) - nn.init.xavier_uniform_(self.out_vision_proj.weight) - self.out_vision_proj.bias.data.fill_(0) - 
nn.init.xavier_uniform_(self.out_text_proj.weight) - self.out_text_proj.bias.data.fill_(0) - def forward( self, vision_features: Tensor, @@ -1412,7 +1396,18 @@ def _init_weights(self, module): elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): module._reset_parameters() elif isinstance(module, GroundingDINOBiMultiHeadAttention): - module._reset_parameters() + nn.init.xavier_uniform_(module.vision_proj.weight) + module.vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.text_proj.weight) + module.text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_vision_proj.weight) + module.values_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_text_proj.weight) + module.values_text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_vision_proj.weight) + module.out_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_text_proj.weight) + module.out_text_proj.bias.data.fill_(0) elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): for p in module.parameters(): if p.dim() > 1: From 5c32bdc9540176840abe93fee2d338d550e1f2a1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 15:28:44 -0300 Subject: [PATCH 111/252] Better naming --- .../models/grounding_dino/modeling_grounding_dino.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 4a5a13d4a74d4b..38683d76ba62ea 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -804,9 +804,9 @@ def forward( # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) - q = k = self.with_pos_embed(hidden_states, position_embeddings) + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) attention_output, attention_weights = self.self_attn( - query=q, key=k, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False + query=queries, key=keys, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output From c561087e944d3366b01a35ba5744fb465943be77 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 15:32:52 -0300 Subject: [PATCH 112/252] Fixing clamp style --- .../grounding_dino/modeling_grounding_dino.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 38683d76ba62ea..6d49851104b482 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -904,23 +904,18 @@ def forward( ) attn_weights = attn_weights - attn_weights.max() - - attn_weights = torch.clamp( - attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + # Do not increase -50000/50000, data type half has quite limited range attn_weights = torch.clamp( - attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + attn_weights, min=-50000, max=50000 + ) attn_weights_T = attn_weights.transpose(1, 2) text_attn_weights = attn_weights_T - torch.max(attn_weights_T, 
dim=-1, keepdim=True)[0] + # Do not increase -50000/50000, data type half has quite limited range text_attn_weights = torch.clamp( - text_attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range - text_attn_weights = torch.clamp( - text_attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + text_attn_weights, min=-50000, max=50000 + ) # mask vison for language if vision_attention_mask is not None: From 07d4c62dc4adefcb2a27a39634392df824a8e272 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:08:34 -0300 Subject: [PATCH 113/252] Better naming --- .../models/grounding_dino/modeling_grounding_dino.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 6d49851104b482..16972223f94046 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -909,8 +909,8 @@ def forward( attn_weights, min=-50000, max=50000 ) - attn_weights_T = attn_weights.transpose(1, 2) - text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] + attn_weights_transposed = attn_weights.transpose(1, 2) + text_attn_weights = attn_weights_transposed - torch.max(attn_weights_transposed, dim=-1, keepdim=True)[0] # Do not increase -50000/50000, data type half has quite limited range text_attn_weights = torch.clamp( From ba37183c4812a2afc678a0ef417f4d4aeb0d35ce Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:10:34 -0300 Subject: [PATCH 114/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 16972223f94046..02bca0495571e0 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -917,7 +917,7 @@ def forward( text_attn_weights, min=-50000, max=50000 ) - # mask vison for language + # mask vision for language if vision_attention_mask is not None: vision_attention_mask = ( vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) From c746e1d96c9e18c113cd3a1748dc5e3741234c1d Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:14:27 -0300 Subject: [PATCH 115/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 02bca0495571e0..2091a95e88b708 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1156,13 +1156,10 @@ def get_text_position_embeddings( ) -> Tensor: batch_size, seq_length, _ = 
text_features.shape if text_position_embedding is None and text_position_ids is None: - text_position_embedding = ( - torch.arange(seq_length, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(batch_size, 1, 1) - ) + text_position_embedding = torch.arange(seq_length, device=text_features.device) + text_position_embedding = text_position_embedding.float() + text_position_embedding = text_position_embedding.unsqueeze(0).unsqueeze(-1) + text_position_embedding = text_position_embedding.repeat(batch_size, 1, 1) text_position_embedding = get_sine_pos_embed( text_position_embedding, num_pos_feats=self.d_model, exchange_xy=False ) From 07b260dd58471d19723510c1fa08c1f786c797b3 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:33:37 -0300 Subject: [PATCH 116/252] Update src/transformers/models/grounding_dino/configuration_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/configuration_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 5d74a970cfa2e2..1d9ba9a25e7082 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -385,7 +385,7 @@ def __init__( self.text_backbone_config = text_backbone_config else: raise ValueError( - f"`text_backbone_config` should be either a `dict` or a `GroundingDINOTextPrenetConfig` instance instead got {type(text_backbone_config)}" + f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDINOTextPrenetConfig`. Received {type(text_backbone_config)} instead." 
) self.max_text_len = max_text_len # Text Enhancer From 898e0727c73ee79184b8567b1bc2af87a56bc1ba Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:36:18 -0300 Subject: [PATCH 117/252] Update src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 8883e64814d33b..fade922f8e5370 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -100,7 +100,7 @@ def create_rename_keys(state_dict, config): f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermidiate + # intermediate rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", From 34b36a3a68c682b085d2314e99910e190f4cd167 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:37:38 -0300 Subject: [PATCH 118/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2091a95e88b708..9e4aaeaa8f5b9d 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -281,7 +281,7 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). You can use [`~GroundingDINOProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. - auxiliary_outputs (`list[Dict]`, *optional*): + auxiliary_outputs (`List[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and `pred_boxes`) for each decoder layer. 
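
One further note on the `torch.clamp(attn_weights, min=-50000, max=50000)` calls consolidated in the bi-attention a few patches above: the in-code comment says half precision "has quite limited range", and indeed float16 cannot represent values above 65504, so unclamped fused-attention logits can overflow to `inf` before the softmax. A quick self-contained check of that failure mode (the values are illustrative only):

```python
import torch

# float16 overflows past ~65504; clamping the logits first keeps them finite
logits = torch.tensor([70000.0, 1.0])
print(torch.isinf(logits.half()).any())                       # tensor(True)  -> overflow
print(torch.isinf(logits.clamp(-50000, 50000).half()).any())  # tensor(False) -> stays finite
```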
From e14d6aea74f9857cfb583c2a05c7eeb7785b3e0d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:39:42 -0300 Subject: [PATCH 119/252] Improving conversion script --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index fade922f8e5370..0737b060a6e379 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -94,13 +94,11 @@ def create_rename_keys(state_dict, config): # attention rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", - # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermediate + # intermidiate rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", @@ -238,10 +236,6 @@ def create_rename_keys(state_dict, config): target_prefix_decoder + target_name)) ########################################## DECODER - END - #TODO convert head - ########################################## HEAD - START - ########################################## HEAD - END - ########################################## Additional - START for layer_name, params in state_dict.items(): #### TEXT BACKBONE From f867e5081eb72a13ece9f50e0a6c200dfe5426d0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:44:59 -0300 Subject: [PATCH 120/252] Improved config --- .../configuration_grounding_dino.py | 17 +++++------------ .../grounding_dino/modeling_grounding_dino.py | 4 ++-- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 1d9ba9a25e7082..b7fee34a46b262 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -220,9 +220,6 @@ class GroundingDINOConfig(PretrainedConfig): two_stage (`bool`, *optional*, defaults to `True`): Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. - with_box_refine (`bool`, *optional*, defaults to `True`): - Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes - based on the predictions from the previous layer. 
class_cost (`float`, *optional*, defaults to 1): Relative weight of the classification error in the Hungarian matching cost. bbox_cost (`float`, *optional*, defaults to 5): @@ -307,12 +304,11 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - with_box_refine=True, - class_cost=1, - bbox_cost=5, - giou_cost=2, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, + class_cost=1., + bbox_cost=5., + giou_cost=2., + bbox_loss_coefficient=5., + giou_loss_coefficient=2., focal_alpha=0.25, disable_custom_kernels=False, # other parameters @@ -364,9 +360,6 @@ def __init__( self.encoder_n_points = encoder_n_points self.decoder_n_points = decoder_n_points self.two_stage = two_stage - self.with_box_refine = with_box_refine - if two_stage is True and with_box_refine is False: - raise ValueError("If two_stage is True, with_box_refine must be True.") # Hungarian matcher self.class_cost = class_cost self.bbox_cost = bbox_cost diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9e4aaeaa8f5b9d..c095c0c1911e21 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -318,11 +318,11 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Stacked intermediate reference points (reference points of each layer of the decoder). init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Initial reference points sent through the Transformer decoder. - enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). - enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. 
""" From fc105bee2456ce58272090c3072d4199649bf7b1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:47:43 -0300 Subject: [PATCH 121/252] Improved naming --- .../grounding_dino/modeling_grounding_dino.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index c095c0c1911e21..fc85e4f212cceb 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -874,21 +874,21 @@ def forward( Returns: _type_: _description_ """ - bsz, tgt_len, _ = vision_features.size() + batch_size, tgt_len, _ = vision_features.size() vision_query_states = self.vision_proj(vision_features) * self.scale - vision_query_states = self._shape(vision_query_states, tgt_len, bsz) + vision_query_states = self._shape(vision_query_states, tgt_len, batch_size) text_key_states = self.text_proj(text_features) - text_key_states = self._shape(text_key_states, -1, bsz) + text_key_states = self._shape(text_key_states, -1, batch_size) vision_value_states = self.values_vision_proj(vision_features) - vision_value_states = self._shape(vision_value_states, -1, bsz) + vision_value_states = self._shape(vision_value_states, -1, batch_size) text_value_states = self.values_text_proj(text_features) - text_value_states = self._shape(text_value_states, -1, bsz) + text_value_states = self._shape(text_value_states, -1, batch_size) - proj_shape = (bsz * self.num_heads, -1, self.head_dim) + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) vision_query_states = vision_query_states.view(*proj_shape) text_key_states = text_key_states.view(*proj_shape) @@ -898,9 +898,9 @@ def forward( src_len = text_key_states.size(1) attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + if attn_weights.size() != (batch_size * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(batch_size * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" ) attn_weights = attn_weights - attn_weights.max() @@ -938,23 +938,23 @@ def forward( vision_attn_output = torch.bmm(vision_attn_probs, text_value_states) text_attn_output = torch.bmm(text_attn_probs, vision_value_states) - if vision_attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + if vision_attn_output.size() != (batch_size * self.num_heads, tgt_len, self.head_dim): raise ValueError( - f"`vision_attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" + f"`vision_attn_output` should be of size {(batch_size, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" ) - if text_attn_output.size() != (bsz * self.num_heads, src_len, self.head_dim): + if text_attn_output.size() != (batch_size * self.num_heads, src_len, self.head_dim): raise ValueError( - f"`text_attn_output` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" + f"`text_attn_output` should be of size {(batch_size, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" ) - vision_attn_output = vision_attn_output.view(bsz, 
self.num_heads, tgt_len, self.head_dim) + vision_attn_output = vision_attn_output.view(batch_size, self.num_heads, tgt_len, self.head_dim) vision_attn_output = vision_attn_output.transpose(1, 2) - vision_attn_output = vision_attn_output.reshape(bsz, tgt_len, self.embed_dim) + vision_attn_output = vision_attn_output.reshape(batch_size, tgt_len, self.embed_dim) - text_attn_output = text_attn_output.view(bsz, self.num_heads, src_len, self.head_dim) + text_attn_output = text_attn_output.view(batch_size, self.num_heads, src_len, self.head_dim) text_attn_output = text_attn_output.transpose(1, 2) - text_attn_output = text_attn_output.reshape(bsz, src_len, self.embed_dim) + text_attn_output = text_attn_output.reshape(batch_size, src_len, self.embed_dim) vision_attn_output = self.out_vision_proj(vision_attn_output) text_attn_output = self.out_text_proj(text_attn_output) From ed1176ef3eb656ef97a15bd3f104bdbb39347b1a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:52:27 -0300 Subject: [PATCH 122/252] Improved naming again --- .../models/grounding_dino/modeling_grounding_dino.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index fc85e4f212cceb..96f89505bfc1af 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -346,8 +346,8 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): enc_outputs_coord_logits: Optional[torch.FloatTensor] = None -def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) +def _get_clones(module, num_copies): + return nn.ModuleList([copy.deepcopy(module) for i in range(num_copies)]) def inverse_sigmoid(x, eps=1e-5): From ef5c90fd3453f50b180df56ac7f867fff0840890 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 16:26:05 -0300 Subject: [PATCH 123/252] Improved grouding-dino.md --- docs/source/en/model_doc/grounding-dino.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index 03c3549c32cb5f..ef41448d3d06ef 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -26,11 +26,23 @@ The abstract from the paper is the following: Tips: - +- One can use [`GroundingDINOProcessor`] to prepare image-text pairs for the model. + + + + Grounding DINO overview. Taken from the original paper. This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/IDEA-Research/GroundingDINO). 
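The tip added above ("One can use [`GroundingDINOProcessor`] to prepare image-text pairs for the model") can be made concrete with a short zero-shot detection sketch. Everything below assumes this PR's in-progress API: the checkpoint name is taken from the pipeline test added later in this series, the thresholds are illustrative, the processor is assumed to accept `images` and `text` keywords and return every tensor the model forward expects, and `post_process_grounded_object_detection` is only introduced in PATCH 133 further down:

```python
import requests
import torch
from PIL import Image

from transformers import GroundingDINOForObjectDetection, GroundingDINOProcessor

model_id = "EduardoPacheco/grounding-dino-tiny"  # checkpoint used in the pipeline test of this series
processor = GroundingDINOProcessor.from_pretrained(model_id)
model = GroundingDINOForObjectDetection.from_pretrained(model_id)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "a cat."  # text queries are lower-cased phrases ending with a dot, as in the tests

inputs = processor(images=image, text=text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.35,  # illustrative values, not mandated by the PR
    text_threshold=0.25,
    target_sizes=[image.size[::-1]],  # (height, width) per image
)
```

The returned `results` follow the image processor's `post_process_object_detection` format, with the `labels` key replaced by decoded phrases once PATCH 133 lands.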
+ +## GroundingDINOImageProcessor + +[[autodoc]] GroundingDINOImageProcessor + - preprocess + - post_process_object_detection + ## GroundingDINOProcessor [[autodoc]] GroundingDINOProcessor From b2fd8687e33e84f12042b4f377c438e47840f14c Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 16:30:37 -0300 Subject: [PATCH 124/252] Moved grounding dino to multimodal --- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b80f2f093699a5..0ec9808abe4ca8 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -523,8 +523,6 @@ title: FocalNet - local: model_doc/glpn title: GLPN - - local: model_doc/grounding-dino - title: Grounding DINO - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit @@ -669,6 +667,8 @@ title: FLAVA - local: model_doc/git title: GIT + - local: model_doc/grounding-dino + title: Grounding DINO - local: model_doc/groupvit title: GroupViT - local: model_doc/idefics From c23497ccba3186fb181f57800a3e89c1013355b0 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 3 Nov 2023 10:34:26 -0300 Subject: [PATCH 125/252] Update src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 0737b060a6e379..4a2c5eb5e21e7d 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -98,7 +98,7 @@ def create_rename_keys(state_dict, config): f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermidiate + # intermediate rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", From a729a389db5932a68b38afccb31b28dcdfc96203 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 3 Nov 2023 11:12:01 -0300 Subject: [PATCH 126/252] Fixed docstrings and style --- .../configuration_grounding_dino.py | 10 +- .../image_processing_grounding_dino.py | 5 +- .../grounding_dino/modeling_grounding_dino.py | 122 ++++++++++++++---- 3 files changed, 104 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index b7fee34a46b262..6ca2114bd3f560 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -304,11 +304,11 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - class_cost=1., - bbox_cost=5., - giou_cost=2., - bbox_loss_coefficient=5., - giou_loss_coefficient=2., + class_cost=1.0, + bbox_cost=5.0, + giou_cost=2.0, + bbox_loss_coefficient=5.0, 
+ giou_loss_coefficient=2.0, focal_alpha=0.25, disable_custom_kernels=False, # other parameters diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 6c9d86f5a026b5..d45ec72dd3fb79 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -778,9 +778,8 @@ class GroundingDINOImageProcessor(BaseImageProcessor): `do_rescale` parameter in the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the - `preprocess` method. + `preprocess` method. Controls whether to normalize the image. Can be overridden by the `do_normalize` + parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): Mean values to use when normalizing the image. Can be a single value or a list of values, one for each diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 96f89505bfc1af..a7cfad6db54e0e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -67,6 +67,7 @@ if is_vision_available(): from transformers.image_transforms import center_to_corners_format + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction class MultiScaleDeformableAttentionFunction(Function): @staticmethod @@ -595,6 +596,7 @@ def build_position_encoding(config): return position_embedding + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention def multi_scale_deformable_attention( value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor @@ -796,10 +798,31 @@ def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Ten def forward( self, - hidden_states: Tensor, - attention_masks: Optional[Tensor] = None, - position_embeddings: Optional[Tensor] = None, - ): # repeat attn mask + hidden_states: torch.FloatTensor, + attention_masks: Optional[torch.BoolTensor] = None, + position_embeddings: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + """Text self-attention to enhance projection of text features generated by + the text encoder (GroundingDINOTextPrenet) within GroundingDINOEncoderLayer + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`): + Text features generated by the text encoder. + attention_masks (`torch.BoolTensor`, *optional*): + Attention mask for text self-attention. False for real tokens and True for padding tokens. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings to be added to the hidden states. + + Returns: + `tuple(torch.FloatTensor)` comprising two elements: + - **hidden_states** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) -- + Output of the text self-attention layer. 
+ - **attention_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, + sequence_length)`) -- + Attention weights of the text self-attention layer. + """ + + # repeat attn mask if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) @@ -858,21 +881,37 @@ def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): def forward( self, - vision_features: Tensor, - text_features: Tensor, - vision_attention_mask: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None, - ): - """_summary_ + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + vision_attention_mask: Optional[torch.BoolTensor] = None, + text_attention_mask: Optional[torch.BoolTensor] = None, + ) -> Tuple[Tuple[torch.FloatTensor, torch.FloatTensor], Tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image-to-text and text-to-image cross-attention Args: - vision_features Tensor: bs, n_img, dim - text_features Tensor: bs, n_text, dim - vision_attention_mask (Tensor, optional): _description_. bs, n_img - text_attention_mask (Tensor, optional): _description_. bs, n_text + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + vision_attention_mask (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. + text_attention_mask (`torch.BoolTensor`, **optional**): + Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens. Returns: - _type_: _description_ + `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an attention + output and weights: + - **vision_attn_output** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_din)`) + -- + Output of the image-to-text cross-attention layer. + - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, vision_sequence_length, + vision_sequence_length)`) -- + Attention weights of the image-to-text cross-attention layer. + - **text_attn_output** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`) -- + Output of the text-to-image cross-attention layer. + - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, text_sequence_length, + text_sequence_length)`) -- + Attention weights of the text-to-image cross-attention layer. 
""" batch_size, tgt_len, _ = vision_features.size() @@ -905,17 +944,13 @@ def forward( attn_weights = attn_weights - attn_weights.max() # Do not increase -50000/50000, data type half has quite limited range - attn_weights = torch.clamp( - attn_weights, min=-50000, max=50000 - ) + attn_weights = torch.clamp(attn_weights, min=-50000, max=50000) attn_weights_transposed = attn_weights.transpose(1, 2) text_attn_weights = attn_weights_transposed - torch.max(attn_weights_transposed, dim=-1, keepdim=True)[0] # Do not increase -50000/50000, data type half has quite limited range - text_attn_weights = torch.clamp( - text_attn_weights, min=-50000, max=50000 - ) + text_attn_weights = torch.clamp(text_attn_weights, min=-50000, max=50000) # mask vision for language if vision_attention_mask is not None: @@ -1013,7 +1048,39 @@ def __init__(self, config, init_values=1e-4): self.vision_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) self.text_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) - def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): + def forward( + self, + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + attention_mask_vision: Optional[torch.BoolTensor] = None, + attention_mask_text: Optional[torch.BoolTensor] = None, + ) -> Tuple[Tuple[torch.FloatTensor, torch.FloatTensor], Tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image and text features fusion + + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + attention_mask_vision (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. + attention_mask_text (`torch.BoolTensor`, **optional**): + Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens. + + Returns: + `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an enhanced + feature and attention output and weights: + - **vision_features** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, vision_dim)`) -- + Updated vision features with attention output from image-to-text cross-attention layer. + - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, vision_sequence_length, + vision_sequence_length)`) -- + Attention weights of the image-to-text cross-attention layer. + - **text_features** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, text_dim)`) -- + Updated text features with attention output from text-to-image cross-attention layer. + - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, text_sequence_length, + text_sequence_length)`) -- + Attention weights of the text-to-image cross-attention layer. + """ vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) (delta_v, vision_attn), (delta_t, text_attn) = self.attn( @@ -1932,9 +1999,12 @@ def custom_forward(*inputs): def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: """Generate attention mask between each pair of special tokens and positional ids. 
Args: - input_ids (`torch.LongTensor`): input ids. Shape: [batch_size, num_token] + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Returns: - `Tuple[torch.Tensor]`: attention mask between each special tokens and position_ids + `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids: + - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`) + - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`) """ batch_size, num_token = input_ids.shape # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens @@ -2105,7 +2175,9 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) proposals = [] current_position = 0 for level, (height, width) in enumerate(spatial_shapes): - mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view(batch_size, height, width, 1) + mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view( + batch_size, height, width, 1 + ) valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) From aafcc34eb9550eda7e0126b07083b10e318127e4 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 12:04:24 +0100 Subject: [PATCH 127/252] Fix docstrings --- .../grounding_dino/configuration_grounding_dino.py | 10 +++++----- .../grounding_dino/image_processing_grounding_dino.py | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 6ca2114bd3f560..16da12a7eaf676 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -220,15 +220,15 @@ class GroundingDINOConfig(PretrainedConfig): two_stage (`bool`, *optional*, defaults to `True`): Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. - class_cost (`float`, *optional*, defaults to 1): + class_cost (`float`, *optional*, defaults to 1.0): Relative weight of the classification error in the Hungarian matching cost. - bbox_cost (`float`, *optional*, defaults to 5): + bbox_cost (`float`, *optional*, defaults to 5.0): Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. - giou_cost (`float`, *optional*, defaults to 2): + giou_cost (`float`, *optional*, defaults to 2.0): Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. - bbox_loss_coefficient (`float`, *optional*, defaults to 5): + bbox_loss_coefficient (`float`, *optional*, defaults to 5.0): Relative weight of the L1 bounding box loss in the object detection loss. - giou_loss_coefficient (`float`, *optional*, defaults to 2): + giou_loss_coefficient (`float`, *optional*, defaults to 2.0): Relative weight of the generalized IoU loss in the object detection loss. focal_alpha (`float`, *optional*, defaults to 0.25): Alpha parameter in the focal loss. 
diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index d45ec72dd3fb79..f415e5e1f4a57b 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -780,7 +780,9 @@ class GroundingDINOImageProcessor(BaseImageProcessor): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. - do_normalize (`bool`, *optional*, defaults to `True`): + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): Mean values to use when normalizing the image. Can be a single value or a list of values, one for each channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. From e4bad9b93e14b1726766d4cd8ac6ca70dfef0f30 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 13:41:40 +0100 Subject: [PATCH 128/252] Remove timm attributes --- .../configuration_grounding_dino.py | 49 ++++--------------- .../convert_grounding_dino_to_hf.py | 19 ++++--- .../grounding_dino/modeling_grounding_dino.py | 42 +++------------- 3 files changed, 28 insertions(+), 82 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 16da12a7eaf676..8bf480e7d99705 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -160,16 +160,10 @@ class GroundingDINOConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - use_timm_backbone (`bool`, *optional*, defaults to `False`): - Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] - API. - backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `{'model_type': 'swin'}`): - The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which - case it will default to `ResNetConfig()`. + backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): + The configuration of the backbone model. text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): The configuration of the text backbone model. Should be a bert-like config. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`GroundingDINOModel`] can detect in a single image. @@ -202,15 +196,6 @@ class GroundingDINOConfig(PretrainedConfig): Whether auxiliary decoding losses (loss at each decoder layer) are to be used. position_embedding_type (`str`, *optional*, defaults to `"sine"`): Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. - backbone (`str`, *optional*, defaults to `"swin"`): - Name of convolutional backbone to use in case `use_timm_backbone` = `True`. 
Supports any convolutional - backbone from the timm package. For a list of all available models, see [this - page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). - use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. - dilation (`bool`, *optional*, defaults to `False`): - Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when - `use_timm_backbone` = `True`. num_feature_levels (`int`, *optional*, defaults to 4): The number of input feature levels. encoder_n_points (`int`, *optional*, defaults to 4): @@ -278,10 +263,8 @@ class GroundingDINOConfig(PretrainedConfig): def __init__( self, - use_timm_backbone=False, - backbone_config={"model_type": "swin"}, + backbone_config=None, text_backbone_config=None, - num_channels=3, num_queries=900, encoder_layers=6, encoder_ffn_dim=2048, @@ -297,9 +280,6 @@ def __init__( activation_dropout=0.0, auxiliary_loss=False, position_embedding_type="sine", - backbone="swin", - use_pretrained_backbone=True, - dilation=False, num_feature_levels=4, encoder_n_points=4, decoder_n_points=4, @@ -324,20 +304,14 @@ def __init__( init_std=0.02, **kwargs, ): - if backbone_config is not None and use_timm_backbone: - raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - - if not use_timm_backbone: - if backbone_config is None: - logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") - backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) - elif isinstance(backbone_config, dict): - backbone_model_type = backbone_config.get("model_type") - config_class = CONFIG_MAPPING[backbone_model_type] - backbone_config = config_class.from_dict(backbone_config) - self.use_timm_backbone = use_timm_backbone + if backbone_config is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) self.backbone_config = backbone_config - self.num_channels = num_channels self.num_queries = num_queries self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim @@ -352,9 +326,6 @@ def __init__( self.activation_function = activation_function self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type - self.backbone = backbone - self.use_pretrained_backbone = use_pretrained_backbone - self.dilation = dilation # deformable attributes self.num_feature_levels = num_feature_levels self.encoder_n_points = encoder_n_points diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4a2c5eb5e21e7d..883540be9c8a03 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -29,6 +29,7 @@ GroundingDINOForObjectDetection, GroundingDINOImageProcessor, GroundingDINOProcessor, + SwinConfig, ) @@ -37,8 +38,6 @@ def get_grounding_dino_config(model_name): - config = GroundingDINOConfig() - if "tiny" in model_name: window_size = 7 embed_dim = 96 @@ -54,12 +53,16 @@ def get_grounding_dino_config(model_name): else: raise ValueError("Model not supported, only supports base and large variants") - config.backbone_config.window_size = window_size - config.backbone_config.image_size = image_size - config.backbone_config.embed_dim = embed_dim - config.backbone_config.depths = depths - config.backbone_config.num_heads = num_heads - config.backbone_config.out_indices = [2, 3, 4] + backbone_config = SwinConfig( + window_size=window_size, + image_size=image_size, + embed_dim=embed_dim, + depths=depths, + num_heads=num_heads, + out_indices=[2, 3, 4], + ) + + config = GroundingDINOConfig(backbone_config=backbone_config) return config diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index a7cfad6db54e0e..5abd1a8685b809 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -33,7 +33,6 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_scipy_available, - is_timm_available, is_torch_cuda_available, is_vision_available, replace_return_docstrings, @@ -120,9 +119,6 @@ def backward(context, grad_output): if is_scipy_available(): from scipy.optimize import linear_sum_assignment -if is_timm_available(): - from timm import create_model - logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "GroundingDINOConfig" @@ -422,58 +418,34 @@ def replace_batch_norm(model): replace_batch_norm(module) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvEncoder with DeformableDetr->GroundingDINO class GroundingDINOConvEncoder(nn.Module): """ - Convolutional backbone, using either the AutoBackbone API or one from the timm library. + Convolutional backbone using the AutoBackbone API. nn.BatchNorm2d layers are replaced by GroundingDINOFrozenBatchNorm2d as defined above. 
- """ def __init__(self, config): super().__init__() self.config = config - - if config.use_timm_backbone: - requires_backends(self, ["timm"]) - kwargs = {} - if config.dilation: - kwargs["output_stride"] = 16 - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), - in_chans=config.num_channels, - **kwargs, - ) - else: - backbone = AutoBackbone.from_config(config.backbone_config) + backbone = AutoBackbone.from_config(config.backbone_config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = ( - self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels - ) + self.intermediate_channel_sizes = self.model.channels - backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type + backbone_model_type = config.backbone_config.model_type if "resnet" in backbone_model_type: for name, parameter in self.model.named_parameters(): - if config.use_timm_backbone: - if "layer2" not in name and "layer3" not in name and "layer4" not in name: - parameter.requires_grad_(False) - else: - if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: - parameter.requires_grad_(False) + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) - # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDINO def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + features = self.model(pixel_values).feature_maps out = [] for feature_map in features: From e48d4118905b4ae652b3a7c635a4d7b1ffc8f453 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 13:44:31 +0100 Subject: [PATCH 129/252] Reorder imports --- .../grounding_dino/modeling_grounding_dino.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 5abd1a8685b809..7a51183dca5ccc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -66,6 +66,19 @@ if is_vision_available(): from transformers.image_transforms import center_to_corners_format +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "GroundingDINOConfig" +_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" + +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "idea-research/grounding-dino-tiny", + # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino +] + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction class MultiScaleDeformableAttentionFunction(Function): @@ -116,20 +129,6 @@ def backward(context, grad_output): return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "GroundingDINOConfig" 
-_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" - -GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "idea-research/grounding-dino-tiny", - # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino -] - - @dataclass class GroundingDINODecoderOutput(ModelOutput): """ From a7f026f9af0a8dd8359ef23773c5e926945f8256 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 17:29:57 +0100 Subject: [PATCH 130/252] More improvements --- docs/source/en/tasks/object_detection.md | 2 +- src/transformers/models/auto/modeling_auto.py | 1 - .../models/grounding_dino/__init__.py | 20 ++++++++++++++++--- .../configuration_grounding_dino.py | 11 +++------- .../image_processing_grounding_dino.py | 18 ++++++++--------- utils/check_repo.py | 1 + 6 files changed, 31 insertions(+), 22 deletions(-) diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 58ec02e80cadf7..7511ee66dd0b99 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Grounding DINO](../model_doc/grounding-dino), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 45669e3ad8b4ac..5084482515a597 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -632,7 +632,6 @@ ("deformable_detr", "DeformableDetrForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), - ("grounding-dino", "GroundingDINOForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 83622a84513843..67ffc2becc52c1 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available _import_structure = { @@ -23,7 +23,6 @@ "GroundingDINOConfig", "GroundingDINOTextPrenetConfig", ], - "image_processing_grounding_dino": ["GroundingDINOImageProcessor"], "processing_grounding_dino": ["GroundingDINOProcessor"], } @@ -40,6 +39,14 @@ "GroundingDINOPreTrainedModel", ] +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_grounding_dino"] = ["GroundingDINOImageProcessor"] + if TYPE_CHECKING: from .configuration_grounding_dino import ( @@ -47,7 +54,6 @@ GroundingDINOConfig, GroundingDINOTextPrenetConfig, ) - from .image_processing_grounding_dino import GroundingDINOImageProcessor from .processing_grounding_dino import GroundingDINOProcessor try: @@ -63,6 
+69,14 @@ GroundingDINOPreTrainedModel, ) + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_grounding_dino import GroundingDINOImageProcessor + else: import sys diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 8bf480e7d99705..50bffc63377d6f 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 IDEA Research and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -72,9 +72,6 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -109,7 +106,6 @@ def __init__( layer_norm_eps=1e-12, pad_token_id=0, position_embedding_type="absolute", - use_cache=True, init_std=0.02, **kwargs, ): @@ -127,7 +123,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type - self.use_cache = use_cache self.init_std = init_std @classmethod @@ -162,8 +157,8 @@ class GroundingDINOConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): The configuration of the backbone model. - text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): - The configuration of the text backbone model. Should be a bert-like config. + text_backbone_config (`str`, *optional*, defaults to `GroundingDINOTextPrenetConfig()`): + The configuration of the text backbone model. Should be a BERT-like config. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`GroundingDINOModel`] can detect in a single image. diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index f415e5e1f4a57b..b1c92686fdde95 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -286,7 +286,7 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndar return masks -# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->DeformableDetr +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDINO def prepare_coco_detection_annotation( image, target, @@ -294,7 +294,7 @@ def prepare_coco_detection_annotation( input_data_format: Optional[Union[ChannelDimension, str]] = None, ): """ - Convert the target in COCO format into the format expected by DeformableDetr. + Convert the target in COCO format into the format expected by GroundingDINO. """ image_height, image_width = get_image_size(image, channel_dim=input_data_format) @@ -379,7 +379,7 @@ def masks_to_boxes(masks: np.ndarray) -> np.ndarray: return np.stack([x_min, y_min, x_max, y_max], 1) -# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DeformableDetr +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDINO def prepare_coco_panoptic_annotation( image: np.ndarray, target: Dict, @@ -388,7 +388,7 @@ def prepare_coco_panoptic_annotation( input_data_format: Union[ChannelDimension, str] = None, ) -> Dict: """ - Prepare a coco panoptic annotation for DeformableDetr. + Prepare a coco panoptic annotation for GroundingDINO. """ image_height, image_width = get_image_size(image, channel_dim=input_data_format) annotation_path = pathlib.Path(masks_path) / target["file_name"] @@ -839,11 +839,11 @@ def __init__( self.do_pad = do_pad @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->DeformableDetr + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `DeformableDetrImageProcessor.from_pretrained(checkpoint, size=600, + created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)` """ image_processor_dict = image_processor_dict.copy() @@ -853,7 +853,7 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DeformableDetr + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDINO def prepare_annotation( self, image: np.ndarray, @@ -864,7 +864,7 @@ def prepare_annotation( input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> Dict: """ - Prepare an annotation for feeding into DeformableDetr model. + Prepare an annotation for feeding into GroundingDINO model. 
""" format = format if format is not None else self.format diff --git a/utils/check_repo.py b/utils/check_repo.py index 95ab142fa0b7f9..cbd2a1781f9966 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -175,6 +175,7 @@ "CLIPSegTextModel", "EsmForProteinFolding", "GPTSanJapaneseModel", + "GroundingDINOForObjectDetection", "TimeSeriesTransformerForPrediction", "InformerForPrediction", "AutoformerForPrediction", From 1930b2ac1ffb57b2eedfa13bc3edf48028683d7a Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 19:32:49 +0100 Subject: [PATCH 131/252] Add Grounding DINO to pipeline --- src/transformers/models/auto/modeling_auto.py | 1 + .../configuration_grounding_dino.py | 1 + ...st_pipelines_zero_shot_object_detection.py | 25 +++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 5084482515a597..cec27eab921110 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -640,6 +640,7 @@ MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Zero Shot Object Detection mapping + ("grounding-dino", "GroundingDINOForObjectDetection"), ("owlv2", "Owlv2ForObjectDetection"), ("owlvit", "OwlViTForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 50bffc63377d6f..474dbb44012b8f 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -236,6 +236,7 @@ class GroundingDINOConfig(PretrainedConfig): The temperature for Sine Positional Embedding that is used together with vision backbone. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ Examples: ```python diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py index c8b424483fa20e..b03ef4285e3115 100644 --- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py @@ -227,3 +227,28 @@ def test_top_k(self): {"score": 0.277, "label": "remote", "box": {"xmin": 40, "ymin": 72, "xmax": 177, "ymax": 115}}, ], ) + + @require_torch + @slow + def test_grounding_dino(self): + object_detector = pipeline("zero-shot-object-detection", model="EduardoPacheco/grounding-dino-tiny") + + outputs = object_detector( + "http://images.cocodataset.org/val2017/000000039769.jpg", + candidate_labels=["a cat."], + ) + + self.assertEqual( + nested_simplify(outputs, decimals=4), + [ + {"score": 0.4526, "label": "a cat.", "box": {"xmin": 344, "ymin": 23, "xmax": 637, "ymax": 373}}, + {"score": 0.4082, "label": "a cat.", "box": {"xmin": 11, "ymin": 51, "xmax": 316, "ymax": 472}}, + {"score": 0.1617, "label": "a cat.", "box": {"xmin": 357, "ymin": 37, "xmax": 552, "ymax": 362}}, + {"score": 0.1299, "label": "a cat.", "box": {"xmin": 330, "ymin": 13, "xmax": 635, "ymax": 445}}, + {"score": 0.1279, "label": "a cat.", "box": {"xmin": 25, "ymin": 54, "xmax": 315, "ymax": 366}}, + {"score": 0.1267, "label": "a cat.", "box": {"xmin": 41, "ymin": 59, "xmax": 306, "ymax": 402}}, + {"score": 0.1098, "label": "a cat.", "box": {"xmin": 279, "ymin": 12, "xmax": 636, "ymax": 408}}, + {"score": 0.1063, "label": "a cat.", "box": {"xmin": 353, "ymin": 39, "xmax": 616, "ymax": 297}}, + {"score": 0.1043, "label": "a cat.", "box": {"xmin": 351, "ymin": 26, "xmax": 550, "ymax": 458}}, + ], + ) From 6ac265ca6b0a241c6f23fdbbeec084d4184f5686 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 19:53:32 +0100 Subject: [PATCH 132/252] Remove model from check_repo --- utils/check_repo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check_repo.py b/utils/check_repo.py index cbd2a1781f9966..95ab142fa0b7f9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -175,7 +175,6 @@ "CLIPSegTextModel", "EsmForProteinFolding", "GPTSanJapaneseModel", - "GroundingDINOForObjectDetection", "TimeSeriesTransformerForPrediction", "InformerForPrediction", "AutoformerForPrediction", From 93b8609ace2b0cc82a7c9ce67c3b5f01b42f7491 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 14 Nov 2023 02:26:43 -0300 Subject: [PATCH 133/252] Added grounded post_process to GroundingDINOProcessor --- .../processing_grounding_dino.py | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 131eb2c600a1cd..066ccbac897bf5 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,39 @@ Processor class for Grounding DINO. """ -from typing import List, Optional, Union +from typing import List, Optional, Union, Dict, Tuple + +import torch from ...image_utils import ImageInput from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType +def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTensor): + """Get token ids of phrases from posmaps and input_ids. 
+ + Args: + posmaps (`torch.BoolTensor` of shape `(num_boxes, hidden_size)`): + A boolean tensor of text-thresholded logits related to the detected bounding boxes. + input_ids (`torch.LongTensor`) of shape `(sequence_length, )`): + A tensor of token ids. + + Returns: + _type_: _description_ + """ + left_idx = 0 + right_idx = 255 + + posmaps[:, 0: left_idx + 1] = False + posmaps[:, right_idx:] = False + + token_ids = [] + for posmap in posmaps: + non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist() + token_ids.append([input_ids[i] for i in non_zero_idx]) + + return token_ids class GroundingDINOProcessor(ProcessorMixin): r""" @@ -149,3 +175,21 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + def post_process_grounded_object_detection(self, outputs, input_ids, box_threshold: float, text_threshold: float, target_sizes: Union[TensorType, List[Tuple]] = None): + """ + Post-process the output of the model to get the grounded object detection results. + """ + results = self.image_processor.post_process_object_detection(outputs, box_threshold, target_sizes) + + probs = torch.sigmoid(outputs.logits) # (batch_size, num_queries, 256) + + for idx, (result, prob) in enumerate(zip(results, probs)): + labels = result["labels"] + # Assuming that selected bboxes are sorted by confidence due to Hungarian matching loss in training + prob = prob[:len(labels)] # len(labels) , 256 + token_ids = get_phrases_from_posmap(prob > text_threshold, input_ids[idx]) + # overrides result labels key + result["labels"] = self.batch_decode(token_ids) + + return results \ No newline at end of file From 6461389afe4add98460f9581cb3d41bd5972d74a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 14 Nov 2023 02:43:28 -0300 Subject: [PATCH 134/252] Fixed style --- .../processing_grounding_dino.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 066ccbac897bf5..695babc3034995 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,7 +16,7 @@ Processor class for Grounding DINO. """ -from typing import List, Optional, Union, Dict, Tuple +from typing import List, Optional, Tuple, Union import torch @@ -25,13 +25,14 @@ from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType + def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTensor): """Get token ids of phrases from posmaps and input_ids. Args: - posmaps (`torch.BoolTensor` of shape `(num_boxes, hidden_size)`): + posmaps (`torch.BoolTensor` of shape `(num_boxes, hidden_size)`): A boolean tensor of text-thresholded logits related to the detected bounding boxes. - input_ids (`torch.LongTensor`) of shape `(sequence_length, )`): + input_ids (`torch.LongTensor`) of shape `(sequence_length, )`): A tensor of token ids. 
Returns: @@ -40,7 +41,7 @@ def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTens left_idx = 0 right_idx = 255 - posmaps[:, 0: left_idx + 1] = False + posmaps[:, 0 : left_idx + 1] = False posmaps[:, right_idx:] = False token_ids = [] @@ -50,6 +51,7 @@ def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTens return token_ids + class GroundingDINOProcessor(ProcessorMixin): r""" Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a @@ -175,21 +177,28 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - def post_process_grounded_object_detection(self, outputs, input_ids, box_threshold: float, text_threshold: float, target_sizes: Union[TensorType, List[Tuple]] = None): + + def post_process_grounded_object_detection( + self, + outputs, + input_ids, + box_threshold: float, + text_threshold: float, + target_sizes: Union[TensorType, List[Tuple]] = None, + ): """ Post-process the output of the model to get the grounded object detection results. """ results = self.image_processor.post_process_object_detection(outputs, box_threshold, target_sizes) - probs = torch.sigmoid(outputs.logits) # (batch_size, num_queries, 256) + probs = torch.sigmoid(outputs.logits) # (batch_size, num_queries, 256) for idx, (result, prob) in enumerate(zip(results, probs)): labels = result["labels"] # Assuming that selected bboxes are sorted by confidence due to Hungarian matching loss in training - prob = prob[:len(labels)] # len(labels) , 256 + prob = prob[: len(labels)] # len(labels) , 256 token_ids = get_phrases_from_posmap(prob > text_threshold, input_ids[idx]) # overrides result labels key result["labels"] = self.batch_decode(token_ids) - return results \ No newline at end of file + return results From e35f1c97191b117e5fdf40ad293d4a07b4925e63 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 14 Nov 2023 09:52:35 -0300 Subject: [PATCH 135/252] Fixed GroundingDINOTextPrenetConfig docstrings --- .../grounding_dino/configuration_grounding_dino.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 474dbb44012b8f..efdb550e8374bc 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -78,13 +78,16 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): Examples: ```python - >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOTextPrenetModel + >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOConfig, GroundingDINOForObjectDetection >>> # Initializing a BERT bert-base-uncased style configuration >>> configuration = GroundingDINOTextPrenetConfig() - >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration - >>> model = GroundingDINOTextPrenetModel(configuration) + >>> # Initializing a GroundingDINOConfig with generated bert-like config + >>> config = GroundingDINOConfig(text_backbone_config=configuration) + + >>> # Initializing a model from the ground-up with a config + >>> model = GroundingDINOForObjectDetection(config) >>> # Accessing the model configuration >>> 
configuration = model.config From 695ffa5a00458975d8e99e08b84eb0adcf08a4ab Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 16 Nov 2023 14:23:46 -0300 Subject: [PATCH 136/252] Aligned inputs.keys() when both image and text are passed with model_input_names --- .../models/grounding_dino/processing_grounding_dino.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 695babc3034995..fa8a09b8e36c6e 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -151,7 +151,9 @@ def __call__( text_encoding = None if text_encoding is not None: - encoding_image_processor.update(text_encoding) + # Keeping same order of model_input_names when both images and text + text_encoding.update(encoding_image_processor) + encoding_image_processor = text_encoding return encoding_image_processor From 7d16d7f684acc95380f8a94703d6c81e055ebafc Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 16 Nov 2023 14:24:18 -0300 Subject: [PATCH 137/252] Added tests for GroundingDINOImageProcessor and GroundingDINOProcessor --- .../test_image_processing_grounding_dino.py | 202 +++++++++++++++++ .../test_processor_grounding_dino.py | 212 ++++++++++++++++++ 2 files changed, 414 insertions(+) create mode 100644 tests/models/grounding_dino/test_image_processing_grounding_dino.py create mode 100644 tests/models/grounding_dino/test_processor_grounding_dino.py diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py new file mode 100644 index 00000000000000..17bbc140de2fc3 --- /dev/null +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -0,0 +1,202 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import unittest + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import GroundingDINOImageProcessor + + +class GroundingDINOImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_rescale=True, + rescale_factor=1 / 255, + do_pad=True, + ): + # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p + size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_pad = do_pad + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_pad": self.do_pad, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to GroundingDINOImageProcessor, + assuming do_resize is set to True with a scalar size. 
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size["shortest_edge"] * h / w) + expected_width = self.size["shortest_edge"] + elif w > h: + expected_height = self.size["shortest_edge"] + expected_width = int(self.size["shortest_edge"] * w / h) + else: + expected_height = self.size["shortest_edge"] + expected_width = self.size["shortest_edge"] + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + return self.num_channels, height, width + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = GroundingDINOImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = GroundingDINOImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "size")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) + self.assertEqual(image_processor.do_pad, True) + + image_processor = self.image_processing_class.from_dict( + self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False + ) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) + self.assertEqual(image_processor.do_pad, False) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + image_processing = GroundingDINOImageProcessor() + encoding = image_processing(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = 
torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py new file mode 100644 index 00000000000000..9231cd8f167350 --- /dev/null +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -0,0 +1,212 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import shutil +import tempfile +import unittest + +import numpy as np +import pytest + +from transformers import BertTokenizer, BertTokenizerFast +from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES +from transformers.testing_utils import require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available + + +if is_vision_available(): + from PIL import Image + + from transformers import GroundingDINOImageProcessor, GroundingDINOProcessor + + +@require_vision +class GroundingDINOProcessorTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + image_processor_map = { + "do_resize": True, + "size": None, + "do_normalize": True, + "image_mean": [0.5, 0.5, 0.5], + "image_std": [0.5, 0.5, 0.5], + "do_rescale": True, + "rescale_factor": 1 / 255, + "do_pad": True, + } + self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME) + with open(self.image_processor_file, "w", encoding="utf-8") as fp: + json.dump(image_processor_map, fp) + + def get_tokenizer(self, **kwargs): + return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + def get_image_processor(self, **kwargs): + return GroundingDINOImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def prepare_image_inputs(self): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. 
+ """ + + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + return image_inputs + + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + image_processor = self.get_image_processor() + + processor_slow = GroundingDINOProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow.save_pretrained(self.tmpdirname) + processor_slow = GroundingDINOProcessor.from_pretrained(self.tmpdirname, use_fast=False) + + processor_fast = GroundingDINOProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast.save_pretrained(self.tmpdirname) + processor_fast = GroundingDINOProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) + self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) + + self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor_slow.image_processor, GroundingDINOImageProcessor) + self.assertIsInstance(processor_fast.image_processor, GroundingDINOImageProcessor) + + def test_save_load_pretrained_additional_features(self): + processor = GroundingDINOProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = GroundingDINOProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, BertTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, GroundingDINOImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_image_proc = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_image_proc.keys(): + self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = 
self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual( + list(inputs.keys()), ["pixel_values", "pixel_mask", "input_ids", "token_type_ids", "attention_mask"] + ) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), processor.model_input_names) From 98321e30286c35483d4f581a44ca228eadfb3bbb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 16 Nov 2023 14:25:36 -0300 Subject: [PATCH 138/252] Testing post_process_grounded_object_detection from GroundingDINOProcessor at test_inference_object_detection_head --- .../grounding_dino/test_modeling_grounding_dino.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 220f1a6231ec9c..cb2af2109cdaef 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -727,6 +727,20 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3)) self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2)) + # verify grounded postprocessing + expected_labels = ["a cat", "a cat"] + results = processor.post_process_grounded_object_detection( + outputs=outputs, + input_ids=encoding.input_ids, + box_threshold=0.35, + text_threshold=0.3, + target_sizes=[image.size[::-1]], + )[0] + + self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3)) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2)) + self.assertListEqual(results["labels"], expected_labels) + @require_torch_gpu def test_inference_object_detection_head_equivalence_cpu_gpu(self): processor = self.default_processor From 3da62df955f2be581f76bef06920554717982ab8 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 17 Nov 2023 21:52:45 -0300 Subject: [PATCH 139/252] Fixed order --- tests/models/grounding_dino/test_processor_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 9231cd8f167350..cc7e5b9ba31c5d 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -178,7 +178,7 @@ def test_processor(self): inputs = 
processor(text=input_str, images=image_input) self.assertListEqual( - list(inputs.keys()), ["pixel_values", "pixel_mask", "input_ids", "token_type_ids", "attention_mask"] + list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"] ) # test if it raises when no input is passed From 6be9a6801a4bf7069e93e9d6562e2d87e208a3d9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 18 Nov 2023 16:55:30 -0300 Subject: [PATCH 140/252] Marked test with require_torch --- .../grounding_dino/test_processor_grounding_dino.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index cc7e5b9ba31c5d..2d7ddfb95e395b 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -23,16 +23,19 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.testing_utils import require_vision, require_torch +from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available, is_torch_available +if is_torch_available(): + import torch + if is_vision_available(): from PIL import Image from transformers import GroundingDINOImageProcessor, GroundingDINOProcessor - +@require_torch @require_vision class GroundingDINOProcessorTest(unittest.TestCase): def setUp(self): From cc1ee6078223c9b78825d8820b76cdd4c436c0d7 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 18 Nov 2023 16:59:26 -0300 Subject: [PATCH 141/252] Temporarily changed repo_id --- .../grounding_dino/configuration_grounding_dino.py | 8 ++++---- .../models/grounding_dino/modeling_grounding_dino.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index efdb550e8374bc..96cf21765f23c7 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -24,7 +24,7 @@ logger = logging.get_logger(__name__) GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", + "EduardoPacheco/grounding-dino-tiny": "https://huggingface.co/EduardoPacheco/grounding-dino-tiny/resolve/main/config.json", } @@ -152,7 +152,7 @@ class GroundingDINOConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO - [idea-research/grounding-dino-tiny](https://huggingface.co/idea-research/grounding-dino-tiny) architecture. + [EduardoPacheco/grounding-dino-tiny](https://huggingface.co/EduardoPacheco/grounding-dino-tiny) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
@@ -245,10 +245,10 @@ class GroundingDINOConfig(PretrainedConfig): ```python >>> from transformers import GroundingDINOConfig, GroundingDINOModel - >>> # Initializing a Grounding DINO idea-research/grounding-dino-tiny style configuration + >>> # Initializing a Grounding DINO EduardoPacheco/grounding-dino-tiny style configuration >>> configuration = GroundingDINOConfig() - >>> # Initializing a model (with random weights) from the idea-research/grounding-dino-tiny style configuration + >>> # Initializing a model (with random weights) from the EduardoPacheco/grounding-dino-tiny style configuration >>> model = GroundingDINOModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 7a51183dca5ccc..19a9e343933d90 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -72,10 +72,10 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "GroundingDINOConfig" -_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" +_CHECKPOINT_FOR_DOC = "EduardoPacheco/grounding-dino-tiny" GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "idea-research/grounding-dino-tiny", + "EduardoPacheco/grounding-dino-tiny", # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino ] @@ -2207,8 +2207,8 @@ def forward( >>> image = Image.open(requests.get(url, stream=True).raw) >>> text = "a cat." - >>> processor = AutoProcessor.from_pretrained("idea-research/grounding-dino-tiny") - >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") + >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) @@ -2496,8 +2496,8 @@ def forward( >>> image = Image.open(requests.get(url, stream=True).raw) >>> text = "a cat." - >>> processor = AutoProcessor.from_pretrained("idea-research/grounding-dino-tiny") - >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") + >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) From 8cf167eb85f7cac2ef6a3185867adf9f0078d091 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 18 Nov 2023 17:55:21 -0300 Subject: [PATCH 142/252] More improvements --- .../models/grounding_dino/processing_grounding_dino.py | 3 --- .../grounding_dino/test_processor_grounding_dino.py | 9 +++++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index fa8a09b8e36c6e..fbf619a271a768 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -34,9 +34,6 @@ def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTens A boolean tensor of text-thresholded logits related to the detected bounding boxes. 
input_ids (`torch.LongTensor`) of shape `(sequence_length, )`): A tensor of token ids. - - Returns: - _type_: _description_ """ left_idx = 0 right_idx = 255 diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 2d7ddfb95e395b..411110de90b849 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -23,17 +23,18 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision, require_torch -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available, is_torch_available +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available if is_torch_available(): - import torch + from transformers import GroundingDINOProcessor if is_vision_available(): from PIL import Image - from transformers import GroundingDINOImageProcessor, GroundingDINOProcessor + from transformers import GroundingDINOImageProcessor + @require_torch @require_vision From 2927c130a144868b66dc7538a479022293a8fefe Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 20 Nov 2023 14:39:08 +0100 Subject: [PATCH 143/252] Fix style --- .../configuration_grounding_dino.py | 2 + .../image_processing_grounding_dino.py | 12 +++--- .../grounding_dino/modeling_grounding_dino.py | 37 +++++-------------- .../processing_grounding_dino.py | 5 ++- 4 files changed, 20 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 96cf21765f23c7..0f49dc4be95a4d 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -92,6 +92,7 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + model_type = "grounding-dino-text-prenet" def __init__( @@ -254,6 +255,7 @@ class GroundingDINOConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + model_type = "grounding-dino" attribute_map = { "hidden_size": "d_model", diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index b1c92686fdde95..251289f7add757 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -135,9 +135,9 @@ def get_resize_output_image_size( image size is computed by keeping the aspect ratio of the input image size. Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or `List[int]`): The desired output size. max_size (`int`, *optional*): The maximum allowed output size. @@ -1350,8 +1350,8 @@ def post_process_object_detection( self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None ): """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. 
+ Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Args: outputs ([`GroundingDINOObjectDetectionOutput`]): @@ -1389,7 +1389,7 @@ def post_process_object_detection( else: img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) boxes = boxes * scale_fct[:, None, :] results = [] diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 19a9e343933d90..2aeff26ad8ecbf 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -406,10 +406,11 @@ def replace_batch_norm(model): if isinstance(module, nn.BatchNorm2d): new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) - new_module.weight.data.copy_(module.weight) - new_module.bias.data.copy_(module.bias) - new_module.running_mean.data.copy_(module.running_mean) - new_module.running_var.data.copy_(module.running_var) + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) model._modules[name] = new_module @@ -476,21 +477,6 @@ def forward(self, pixel_values, pixel_mask): return out, pos -# Copied from transformers.models.detr.modeling_detr._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): - """ - Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. - """ - batch_size, source_len = mask.size() - target_len = target_len if target_len is not None else source_len - - expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) - - class GroundingDINOSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you @@ -3516,20 +3502,15 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, past_key_value, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, hidden_states, attention_mask, layer_head_mask, encoder_hidden_states, encoder_attention_mask, + past_key_value, + output_attentions, ) else: layer_outputs = layer_module( diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index fbf619a271a768..00c1e864faf06a 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -64,6 +64,7 @@ class GroundingDINOProcessor(ProcessorMixin): tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 
""" + attributes = ["image_processor", "tokenizer"] image_processor_class = "GroundingDINOImageProcessor" tokenizer_class = "AutoTokenizer" @@ -165,8 +166,8 @@ def batch_decode(self, *args, **kwargs): # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer def decode(self, *args, **kwargs): """ - This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer - to the docstring of this method for more information. + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. """ return self.tokenizer.decode(*args, **kwargs) From 42ee6bcae87cffb4d05352def77dbde27434e92d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 20 Nov 2023 10:41:39 -0300 Subject: [PATCH 144/252] Final improvements --- .../configuration_grounding_dino.py | 2 + .../image_processing_grounding_dino.py | 12 ++--- .../grounding_dino/modeling_grounding_dino.py | 37 ++++--------- .../processing_grounding_dino.py | 5 +- .../test_image_processing_grounding_dino.py | 52 +++++++++++++++++++ .../test_processor_grounding_dino.py | 11 ++++ 6 files changed, 83 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 96cf21765f23c7..0f49dc4be95a4d 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -92,6 +92,7 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + model_type = "grounding-dino-text-prenet" def __init__( @@ -254,6 +255,7 @@ class GroundingDINOConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + model_type = "grounding-dino" attribute_map = { "hidden_size": "d_model", diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index b1c92686fdde95..251289f7add757 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -135,9 +135,9 @@ def get_resize_output_image_size( image size is computed by keeping the aspect ratio of the input image size. Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or `List[int]`): The desired output size. max_size (`int`, *optional*): The maximum allowed output size. @@ -1350,8 +1350,8 @@ def post_process_object_detection( self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None ): """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. 
Args: outputs ([`GroundingDINOObjectDetectionOutput`]): @@ -1389,7 +1389,7 @@ def post_process_object_detection( else: img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) boxes = boxes * scale_fct[:, None, :] results = [] diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 19a9e343933d90..2aeff26ad8ecbf 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -406,10 +406,11 @@ def replace_batch_norm(model): if isinstance(module, nn.BatchNorm2d): new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) - new_module.weight.data.copy_(module.weight) - new_module.bias.data.copy_(module.bias) - new_module.running_mean.data.copy_(module.running_mean) - new_module.running_var.data.copy_(module.running_var) + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) model._modules[name] = new_module @@ -476,21 +477,6 @@ def forward(self, pixel_values, pixel_mask): return out, pos -# Copied from transformers.models.detr.modeling_detr._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): - """ - Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. - """ - batch_size, source_len = mask.size() - target_len = target_len if target_len is not None else source_len - - expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) - - class GroundingDINOSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you @@ -3516,20 +3502,15 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, past_key_value, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, hidden_states, attention_mask, layer_head_mask, encoder_hidden_states, encoder_attention_mask, + past_key_value, + output_attentions, ) else: layer_outputs = layer_module( diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index fbf619a271a768..00c1e864faf06a 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -64,6 +64,7 @@ class GroundingDINOProcessor(ProcessorMixin): tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 
""" + attributes = ["image_processor", "tokenizer"] image_processor_class = "GroundingDINOImageProcessor" tokenizer_class = "AutoTokenizer" @@ -165,8 +166,8 @@ def batch_decode(self, *args, **kwargs): # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer def decode(self, *args, **kwargs): """ - This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer - to the docstring of this method for more information. + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. """ return self.tokenizer.decode(*args, **kwargs) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 17bbc140de2fc3..3c24b9cedd4340 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -15,6 +15,7 @@ import json +import pathlib import unittest from transformers.testing_utils import require_torch, require_vision, slow @@ -32,6 +33,7 @@ from transformers import GroundingDINOImageProcessor +# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester with DeformableDetr->GroundingDINO class GroundingDINOImageProcessingTester(unittest.TestCase): def __init__( self, @@ -126,6 +128,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_torch @require_vision +# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDINO class GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = GroundingDINOImageProcessor if is_vision_available() else None @@ -200,3 +203,52 @@ def test_call_pytorch_with_coco_detection_annotations(self): # verify size expected_size = torch.tensor([800, 1066]) self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + image_processing = GroundingDINOImageProcessor(format="coco_panoptic") + encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + 
expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify masks + expected_masks_sum = 822873 + self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 411110de90b849..b48350e0099ea7 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -77,18 +77,23 @@ def setUp(self): with open(self.image_processor_file, "w", encoding="utf-8") as fp: json.dump(image_processor_map, fp) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_rust_tokenizer with CLIP->Bert def get_rust_tokenizer(self, **kwargs): return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_image_processor with CLIP->GroundingDINO def get_image_processor(self, **kwargs): return GroundingDINOImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.tearDown def tearDown(self): shutil.rmtree(self.tmpdirname) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.prepare_image_inputs def prepare_image_inputs(self): """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, or a list of PyTorch tensors if one specifies torchify=True. 
@@ -100,6 +105,7 @@ def prepare_image_inputs(self): return image_inputs + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_default with CLIP->GroundingDINO,GroundingDINOTokenizer->BertTokenizer def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() @@ -124,6 +130,7 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_slow.image_processor, GroundingDINOImageProcessor) self.assertIsInstance(processor_fast.image_processor, GroundingDINOImageProcessor) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDINO,GroundingDINOTokenizer->BertTokenizer def test_save_load_pretrained_additional_features(self): processor = GroundingDINOProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) processor.save_pretrained(self.tmpdirname) @@ -141,6 +148,7 @@ def test_save_load_pretrained_additional_features(self): self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) self.assertIsInstance(processor.image_processor, GroundingDINOImageProcessor) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDINO def test_image_processor(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() @@ -155,6 +163,7 @@ def test_image_processor(self): for key in input_image_proc.keys(): self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDINO def test_tokenizer(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() @@ -189,6 +198,7 @@ def test_processor(self): with pytest.raises(ValueError): processor() + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer_decode with CLIP->GroundingDINO def test_tokenizer_decode(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() @@ -202,6 +212,7 @@ def test_tokenizer_decode(self): self.assertListEqual(decoded_tok, decoded_processor) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_model_input_names with CLIP->GroundingDINO def test_model_input_names(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() From e2b48b0ed8190971b1a3bd85d8f7eb97f87f2ea7 Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 23 Nov 2023 13:06:06 +0100 Subject: [PATCH 145/252] Improve annotators --- .../test_image_processing_grounding_dino.py | 2 +- .../grounding_dino/test_processor_grounding_dino.py | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 3c24b9cedd4340..cca1233e6d7bc6 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -126,7 +126,6 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F ) -@require_torch @require_vision # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDINO class 
GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): @@ -161,6 +160,7 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.do_pad, False) @slow + @require_torch def test_call_pytorch_with_coco_detection_annotations(self): # prepare image and target image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index b48350e0099ea7..7b658d8724dd68 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -21,22 +21,18 @@ import numpy as np import pytest -from transformers import BertTokenizer, BertTokenizerFast +from transformers import BertTokenizer, BertTokenizerFast, GroundingDINOProcessor from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_torch, require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available +from transformers.testing_utils import require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available -if is_torch_available(): - from transformers import GroundingDINOProcessor - if is_vision_available(): from PIL import Image from transformers import GroundingDINOImageProcessor -@require_torch @require_vision class GroundingDINOProcessorTest(unittest.TestCase): def setUp(self): From 5e1f0d97518c7422f1025ea9f2bef893b1bf1e84 Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 23 Nov 2023 13:40:33 +0100 Subject: [PATCH 146/252] Fix style --- .../grounding_dino/test_image_processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index cca1233e6d7bc6..3c24b9cedd4340 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -126,6 +126,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F ) +@require_torch @require_vision # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDINO class GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): @@ -160,7 +161,6 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.do_pad, False) @slow - @require_torch def test_call_pytorch_with_coco_detection_annotations(self): # prepare image and target image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") From c9a84403b7ee7d29f11dee80fa6dae9a19d77084 Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 23 Nov 2023 14:22:35 +0100 Subject: [PATCH 147/252] Add is_torch_available --- .../models/grounding_dino/processing_grounding_dino.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 00c1e864faf06a..1164f6541f5fcd 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -18,12 +18,14 @@ from typing import List, Optional, Tuple, Union -import torch - 
from ...image_utils import ImageInput from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...utils import TensorType, is_torch_available + + +if is_torch_available(): + import torch def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTensor): From f954f4bdb388969b48aa14e9100e45eed85d2b89 Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 23 Nov 2023 15:06:35 +0100 Subject: [PATCH 148/252] Remove type hints --- .../models/grounding_dino/processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 1164f6541f5fcd..ac3d44eaa758b4 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -28,7 +28,7 @@ import torch -def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTensor): +def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. Args: From 2eb2a98274ed889176db066df58f17c7a8c525b2 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:31:32 -0300 Subject: [PATCH 149/252] vocab_tokens as one liner --- .../test_processor_grounding_dino.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 7b658d8724dd68..13a133c80cc861 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -38,23 +38,7 @@ class GroundingDINOProcessorTest(unittest.TestCase): def setUp(self): self.tmpdirname = tempfile.mkdtemp() - vocab_tokens = [ - "[UNK]", - "[CLS]", - "[SEP]", - "[PAD]", - "[MASK]", - "want", - "##want", - "##ed", - "wa", - "un", - "runn", - "##ing", - ",", - "low", - "lowest", - ] + vocab_tokens = ["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]","want","##want","##ed","wa","un","runn","##ing",",","low","lowest"] # fmt: skip self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) From 625123a087d804b4260fb17d68180f8ee87c34db Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:33:30 -0300 Subject: [PATCH 150/252] Removed print statements --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 883540be9c8a03..f70a71c1d6a741 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -358,8 +358,6 @@ def convert_grounding_dino_checkpoint(args): model = GroundingDINOForObjectDetection(config) model.eval() missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) # Load and process test image image = prepare_img() @@ -379,8 +377,6 @@ def 
convert_grounding_dino_checkpoint(args): with torch.no_grad(): outputs = model(**inputs) - print("First values of logits:", outputs.logits[0, :3, :3]) - print("First values of boxes:", outputs.pred_boxes[0, :3, :3]) # verify outputs expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]) @@ -389,15 +385,12 @@ def convert_grounding_dino_checkpoint(args): ) assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-3) - print("Looks ok!") if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print(f"Pushing model and processor for {model_name} to hub") model.push_to_hub(f"EduardoPacheco/{model_name}") processor.push_to_hub(f"EduardoPacheco/{model_name}") From 4553ad1695cc41c6a613c8272c7faa1462f1988d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:36:53 -0300 Subject: [PATCH 151/252] Renamed GroundingDINOTextPrenetConfig to GroundingDINOTextConfig --- docs/source/en/model_doc/grounding-dino.md | 4 ++-- src/transformers/__init__.py | 4 ++-- .../models/grounding_dino/__init__.py | 4 ++-- .../configuration_grounding_dino.py | 16 ++++++++-------- .../grounding_dino/modeling_grounding_dino.py | 4 ++-- .../test_modeling_grounding_dino.py | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index ef41448d3d06ef..bb95255d28014b 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -47,9 +47,9 @@ The original code can be found [here](https://github.com/IDEA-Research/Grounding [[autodoc]] GroundingDINOProcessor -## GroundingDINOTextPrenetConfig +## GroundingDINOTextConfig -[[autodoc]] GroundingDINOTextPrenetConfig +[[autodoc]] GroundingDINOTextConfig ## GroundingDINOConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0ec3644d249a18..79b8fd62edaa00 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -374,7 +374,7 @@ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOProcessor", - "GroundingDINOTextPrenetConfig", + "GroundingDINOTextConfig", ], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -4603,7 +4603,7 @@ GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOProcessor, - GroundingDINOTextPrenetConfig, + GroundingDINOTextConfig, ) from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 67ffc2becc52c1..b5db32c0f8ae47 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -21,7 +21,7 @@ "configuration_grounding_dino": [ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", - "GroundingDINOTextPrenetConfig", + "GroundingDINOTextConfig", ], "processing_grounding_dino": ["GroundingDINOProcessor"], } @@ -52,7 +52,7 @@ from .configuration_grounding_dino import ( GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, - GroundingDINOTextPrenetConfig, + GroundingDINOTextConfig, ) from .processing_grounding_dino import 
GroundingDINOProcessor diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0f49dc4be95a4d..264ba34faf20b6 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -28,7 +28,7 @@ } -class GroundingDINOTextPrenetConfig(PretrainedConfig): +class GroundingDINOTextConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a @@ -78,10 +78,10 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): Examples: ```python - >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOConfig, GroundingDINOForObjectDetection + >>> from transformers import GroundingDINOTextConfig, GroundingDINOConfig, GroundingDINOForObjectDetection >>> # Initializing a BERT bert-base-uncased style configuration - >>> configuration = GroundingDINOTextPrenetConfig() + >>> configuration = GroundingDINOTextConfig() >>> # Initializing a GroundingDINOConfig with generated bert-like config >>> config = GroundingDINOConfig(text_backbone_config=configuration) @@ -161,7 +161,7 @@ class GroundingDINOConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): The configuration of the backbone model. - text_backbone_config (`str`, *optional*, defaults to `GroundingDINOTextPrenetConfig()`): + text_backbone_config (`str`, *optional*, defaults to `GroundingDINOTextConfig()`): The configuration of the text backbone model. Should be a BERT-like config. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects @@ -343,14 +343,14 @@ def __init__( self.disable_custom_kernels = disable_custom_kernels # Text backbone if text_backbone_config is None: - self.text_backbone_config = GroundingDINOTextPrenetConfig() + self.text_backbone_config = GroundingDINOTextConfig() elif isinstance(text_backbone_config, dict): - self.text_backbone_config = GroundingDINOTextPrenetConfig(**text_backbone_config) - elif isinstance(text_backbone_config, GroundingDINOTextPrenetConfig): + self.text_backbone_config = GroundingDINOTextConfig(**text_backbone_config) + elif isinstance(text_backbone_config, GroundingDINOTextConfig): self.text_backbone_config = text_backbone_config else: raise ValueError( - f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDINOTextPrenetConfig`. Received {type(text_backbone_config)} instead." + f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDINOTextConfig`. Received {type(text_backbone_config)} instead." 
) self.max_text_len = max_text_len # Text Enhancer diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2aeff26ad8ecbf..5741acbd7e7d2f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -46,7 +46,7 @@ from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone -from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextPrenetConfig +from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextConfig from .load_custom import load_cuda_kernels @@ -3572,7 +3572,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): - config_class = GroundingDINOTextPrenetConfig + config_class = GroundingDINOTextConfig def __init__(self, config, add_pooling_layer=True): super().__init__(config) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index cb2af2109cdaef..f8fc49fd3754ea 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -23,7 +23,7 @@ from transformers import ( GroundingDINOConfig, - GroundingDINOTextPrenetConfig, + GroundingDINOTextConfig, SwinConfig, is_torch_available, is_vision_available, @@ -146,7 +146,7 @@ def get_config(self): out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) - text_backbone = GroundingDINOTextPrenetConfig( + text_backbone = GroundingDINOTextConfig( hidden_size=8, num_hidden_layers=2, num_attention_heads=2, intermediate_size=8, max_position_embeddings=8 ) return GroundingDINOConfig( From 3b6b2c2479c543aa3d95c437b487e6ebb52022cb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:38:32 -0300 Subject: [PATCH 152/252] remove unnecessary comments --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index f70a71c1d6a741..04a7772fef19cb 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -70,7 +70,6 @@ def get_grounding_dino_config(model_name): def create_rename_keys(state_dict, config): rename_keys = [] # fmt: off - #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer rename_keys.append(("backbone.0.patch_embed.proj.weight", From afb26499a15f4d7ea68d271697f5348131811971 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:41:58 -0300 Subject: [PATCH 153/252] Removed unnecessary tests on conversion script --- .../grounding_dino/convert_grounding_dino_to_hf.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 04a7772fef19cb..b075e43969dd85 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ 
b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -374,16 +374,7 @@ def convert_grounding_dino_checkpoint(args): # Running forward with torch.no_grad(): - outputs = model(**inputs) - - - # verify outputs - expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]) - expected_logits = torch.tensor( - [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] - ) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-3) + _ = model(**inputs) if pytorch_dump_folder_path is not None: model.save_pretrained(pytorch_dump_folder_path) From 4fdaf425ac7e8b3b5741bd00bfe0e6952a910900 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 14:34:20 -0300 Subject: [PATCH 154/252] Renamed GroundingDINO to camel case GroundingDino --- docs/source/en/model_doc/grounding-dino.md | 26 +- src/transformers/__init__.py | 28 +- .../models/auto/configuration_auto.py | 2 +- .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 +- .../models/grounding_dino/__init__.py | 28 +- .../configuration_grounding_dino.py | 40 +-- .../convert_grounding_dino_to_hf.py | 26 +- .../image_processing_grounding_dino.py | 24 +- .../grounding_dino/modeling_grounding_dino.py | 274 +++++++++--------- .../processing_grounding_dino.py | 8 +- src/transformers/utils/dummy_pt_objects.py | 6 +- .../utils/dummy_vision_objects.py | 2 +- .../test_image_processing_grounding_dino.py | 20 +- .../test_modeling_grounding_dino.py | 44 +-- .../test_processor_grounding_dino.py | 50 ++-- utils/check_repo.py | 2 +- 17 files changed, 293 insertions(+), 293 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index bb95255d28014b..f3ccc78ad5c876 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -26,7 +26,7 @@ The abstract from the paper is the following: Tips: -- One can use [`GroundingDINOProcessor`] to prepare image-text pairs for the model. +- One can use [`GroundingDinoProcessor`] to prepare image-text pairs for the model. drawing @@ -37,30 +37,30 @@ This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPac The original code can be found [here](https://github.com/IDEA-Research/GroundingDINO). 
-## GroundingDINOImageProcessor +## GroundingDinoImageProcessor -[[autodoc]] GroundingDINOImageProcessor +[[autodoc]] GroundingDinoImageProcessor - preprocess - post_process_object_detection -## GroundingDINOProcessor +## GroundingDinoProcessor -[[autodoc]] GroundingDINOProcessor +[[autodoc]] GroundingDinoProcessor -## GroundingDINOTextConfig +## GroundingDinoTextConfig -[[autodoc]] GroundingDINOTextConfig +[[autodoc]] GroundingDinoTextConfig -## GroundingDINOConfig +## GroundingDinoConfig -[[autodoc]] GroundingDINOConfig +[[autodoc]] GroundingDinoConfig -## GroundingDINOModel +## GroundingDinoModel -[[autodoc]] GroundingDINOModel +[[autodoc]] GroundingDinoModel - forward -## GroundingDINOForObjectDetection +## GroundingDinoForObjectDetection -[[autodoc]] GroundingDINOForObjectDetection +[[autodoc]] GroundingDinoForObjectDetection - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 79b8fd62edaa00..ac41ec69ca30b6 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -372,9 +372,9 @@ "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], "models.grounding_dino": [ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", - "GroundingDINOConfig", - "GroundingDINOProcessor", - "GroundingDINOTextConfig", + "GroundingDinoConfig", + "GroundingDinoProcessor", + "GroundingDinoTextConfig", ], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -994,7 +994,7 @@ _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) - _import_structure["models.grounding_dino"].extend(["GroundingDINOImageProcessor"]) + _import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"]) _import_structure["models.idefics"].extend(["IdeficsImageProcessor"]) _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"]) _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"]) @@ -2001,9 +2001,9 @@ _import_structure["models.grounding_dino"].extend( [ "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", - "GroundingDINOForObjectDetection", - "GroundingDINOModel", - "GroundingDINOPreTrainedModel", + "GroundingDinoForObjectDetection", + "GroundingDinoModel", + "GroundingDinoPreTrainedModel", ] ) _import_structure["models.groupvit"].extend( @@ -4601,9 +4601,9 @@ from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig from .models.grounding_dino import ( GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, - GroundingDINOConfig, - GroundingDINOProcessor, - GroundingDINOTextConfig, + GroundingDinoConfig, + GroundingDinoProcessor, + GroundingDinoTextConfig, ) from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5164,7 +5164,7 @@ from .models.flava import FlavaFeatureExtractor, FlavaImageProcessor, FlavaProcessor from .models.fuyu import FuyuImageProcessor, FuyuProcessor from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor - from .models.grounding_dino import GroundingDINOImageProcessor + from .models.grounding_dino import GroundingDinoImageProcessor from .models.idefics import IdeficsImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor @@ 
-6010,9 +6010,9 @@ ) from .models.grounding_dino import ( GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, - GroundingDINOForObjectDetection, - GroundingDINOModel, - GroundingDINOPreTrainedModel, + GroundingDinoForObjectDetection, + GroundingDinoModel, + GroundingDinoPreTrainedModel, ) from .models.groupvit import ( GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 5ededa2c191d09..01060b3dc3f31b 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -110,7 +110,7 @@ ("gptj", "GPTJConfig"), ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), - ("grounding-dino", "GroundingDINOConfig"), + ("grounding-dino", "GroundingDinoConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 99a8cc0387b18a..da4c75f558060f 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -67,7 +67,7 @@ ("fuyu", "FuyuImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), + ("grounding-dino", "GroundingDinoImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 27c9b4ce094424..c64e2ab050ee51 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -106,7 +106,7 @@ ("gptj", "GPTJModel"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), - ("grounding-dino", "GroundingDINOModel"), + ("grounding-dino", "GroundingDinoModel"), ("groupvit", "GroupViTModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), @@ -648,7 +648,7 @@ MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Zero Shot Object Detection mapping - ("grounding-dino", "GroundingDINOForObjectDetection"), + ("grounding-dino", "GroundingDinoForObjectDetection"), ("owlv2", "Owlv2ForObjectDetection"), ("owlvit", "OwlViTForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index b5db32c0f8ae47..6dfe21cf83d5e0 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -20,10 +20,10 @@ _import_structure = { "configuration_grounding_dino": [ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", - "GroundingDINOConfig", - "GroundingDINOTextConfig", + "GroundingDinoConfig", + "GroundingDinoTextConfig", ], - "processing_grounding_dino": ["GroundingDINOProcessor"], + "processing_grounding_dino": ["GroundingDinoProcessor"], } try: @@ -34,9 +34,9 @@ else: _import_structure["modeling_grounding_dino"] = [ "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", - "GroundingDINOForObjectDetection", - "GroundingDINOModel", - "GroundingDINOPreTrainedModel", + "GroundingDinoForObjectDetection", + "GroundingDinoModel", + "GroundingDinoPreTrainedModel", ] try: @@ -45,16 +45,16 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["image_processing_grounding_dino"] = 
["GroundingDINOImageProcessor"] + _import_structure["image_processing_grounding_dino"] = ["GroundingDinoImageProcessor"] if TYPE_CHECKING: from .configuration_grounding_dino import ( GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, - GroundingDINOConfig, - GroundingDINOTextConfig, + GroundingDinoConfig, + GroundingDinoTextConfig, ) - from .processing_grounding_dino import GroundingDINOProcessor + from .processing_grounding_dino import GroundingDinoProcessor try: if not is_torch_available(): @@ -64,9 +64,9 @@ else: from .modeling_grounding_dino import ( GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, - GroundingDINOForObjectDetection, - GroundingDINOModel, - GroundingDINOPreTrainedModel, + GroundingDinoForObjectDetection, + GroundingDinoModel, + GroundingDinoPreTrainedModel, ) try: @@ -75,7 +75,7 @@ except OptionalDependencyNotAvailable: pass else: - from .image_processing_grounding_dino import GroundingDINOImageProcessor + from .image_processing_grounding_dino import GroundingDinoImageProcessor else: import sys diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 264ba34faf20b6..e7091ba2b695d7 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -28,9 +28,9 @@ } -class GroundingDINOTextConfig(PretrainedConfig): +class GroundingDinoTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`]. It is used to + This is the configuration class to store the configuration of a [`GroundingDinoTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. @@ -41,7 +41,7 @@ class GroundingDINOTextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`]. + `inputs_ids` passed when calling [`GroundingDinoTextPrenetModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -61,7 +61,7 @@ class GroundingDINOTextConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`]. + The vocabulary size of the `token_type_ids` passed when calling [`GroundingDinoTextPrenetModel`]. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. 
pad_token_id (`int`, *optional*, defaults to 0): @@ -78,16 +78,16 @@ class GroundingDINOTextConfig(PretrainedConfig): Examples: ```python - >>> from transformers import GroundingDINOTextConfig, GroundingDINOConfig, GroundingDINOForObjectDetection + >>> from transformers import GroundingDinoTextConfig, GroundingDinoConfig, GroundingDinoForObjectDetection >>> # Initializing a BERT bert-base-uncased style configuration - >>> configuration = GroundingDINOTextConfig() + >>> configuration = GroundingDinoTextConfig() - >>> # Initializing a GroundingDINOConfig with generated bert-like config - >>> config = GroundingDINOConfig(text_backbone_config=configuration) + >>> # Initializing a GroundingDinoConfig with generated bert-like config + >>> config = GroundingDinoConfig(text_backbone_config=configuration) >>> # Initializing a model from the ground-up with a config - >>> model = GroundingDINOForObjectDetection(config) + >>> model = GroundingDinoForObjectDetection(config) >>> # Accessing the model configuration >>> configuration = model.config @@ -148,9 +148,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return cls.from_dict(config_dict, **kwargs) -class GroundingDINOConfig(PretrainedConfig): +class GroundingDinoConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`GroundingDinoModel`]. It is used to instantiate a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO [EduardoPacheco/grounding-dino-tiny](https://huggingface.co/EduardoPacheco/grounding-dino-tiny) architecture. @@ -161,11 +161,11 @@ class GroundingDINOConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): The configuration of the backbone model. - text_backbone_config (`str`, *optional*, defaults to `GroundingDINOTextConfig()`): + text_backbone_config (`str`, *optional*, defaults to `GroundingDinoTextConfig()`): The configuration of the text backbone model. Should be a BERT-like config. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects - [`GroundingDINOModel`] can detect in a single image. + [`GroundingDinoModel`] can detect in a single image. encoder_layers (`int`, *optional*, defaults to 6): Number of encoder layers. 
encoder_ffn_dim (`int`, *optional*, defaults to 2048): @@ -244,13 +244,13 @@ class GroundingDINOConfig(PretrainedConfig): Examples: ```python - >>> from transformers import GroundingDINOConfig, GroundingDINOModel + >>> from transformers import GroundingDinoConfig, GroundingDinoModel >>> # Initializing a Grounding DINO EduardoPacheco/grounding-dino-tiny style configuration - >>> configuration = GroundingDINOConfig() + >>> configuration = GroundingDinoConfig() >>> # Initializing a model (with random weights) from the EduardoPacheco/grounding-dino-tiny style configuration - >>> model = GroundingDINOModel(configuration) + >>> model = GroundingDinoModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -343,14 +343,14 @@ def __init__( self.disable_custom_kernels = disable_custom_kernels # Text backbone if text_backbone_config is None: - self.text_backbone_config = GroundingDINOTextConfig() + self.text_backbone_config = GroundingDinoTextConfig() elif isinstance(text_backbone_config, dict): - self.text_backbone_config = GroundingDINOTextConfig(**text_backbone_config) - elif isinstance(text_backbone_config, GroundingDINOTextConfig): + self.text_backbone_config = GroundingDinoTextConfig(**text_backbone_config) + elif isinstance(text_backbone_config, GroundingDinoTextConfig): self.text_backbone_config = text_backbone_config else: raise ValueError( - f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDINOTextConfig`. Received {type(text_backbone_config)} instead." + f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDinoTextConfig`. Received {type(text_backbone_config)} instead." ) self.max_text_len = max_text_len # Text Enhancer diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index b075e43969dd85..066e0a209a0f53 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert GroundingDINO SimMIM checkpoints from the original repository. +"""Convert GroundingDino SimMIM checkpoints from the original repository. 
URL: https://github.com/IDEA-Research/GroundingDINO""" @@ -25,10 +25,10 @@ from transformers import ( AutoTokenizer, - GroundingDINOConfig, - GroundingDINOForObjectDetection, - GroundingDINOImageProcessor, - GroundingDINOProcessor, + GroundingDinoConfig, + GroundingDinoForObjectDetection, + GroundingDinoImageProcessor, + GroundingDinoProcessor, SwinConfig, ) @@ -62,7 +62,7 @@ def get_grounding_dino_config(model_name): out_indices=[2, 3, 4], ) - config = GroundingDINOConfig(backbone_config=backbone_config) + config = GroundingDinoConfig(backbone_config=backbone_config) return config @@ -334,10 +334,10 @@ def convert_grounding_dino_checkpoint(args): push_to_hub = args.push_to_hub checkpoint_mapping = { - "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth", - "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth", + "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth", + "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth", } - # Define default GroundingDINO configuation + # Define default GroundingDino configuation config = get_grounding_dino_config(model_name) # Load original checkpoint @@ -354,7 +354,7 @@ def convert_grounding_dino_checkpoint(args): read_in_q_k_v(new_state_dict, config) # Load HF model - model = GroundingDINOForObjectDetection(config) + model = GroundingDinoForObjectDetection(config) model.eval() missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) @@ -363,9 +363,9 @@ def convert_grounding_dino_checkpoint(args): transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) original_pixel_values = transforms(image).unsqueeze(0) - image_processor = GroundingDINOImageProcessor() + image_processor = GroundingDinoImageProcessor() tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") - processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer) + processor = GroundingDinoProcessor(image_processor=image_processor, tokenizer=tokenizer) text = "a cat" inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") @@ -393,7 +393,7 @@ def convert_grounding_dino_checkpoint(args): default="grounding-dino-tiny", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], - help="Name of the GroundingDINO model you'd like to convert.", + help="Name of the GroundingDino model you'd like to convert.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
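As a side note on how the conversion script above operates: `create_rename_keys` yields `(original_key, hf_key)` pairs, the checkpoint's state dict is rewritten accordingly, attention q/k/v weights are split in `read_in_q_k_v`, and only then is `load_state_dict` called. A minimal sketch of the renaming step follows, with a stand-in tensor and a guessed HF-side key name purely for illustration; the authoritative mapping lives in `create_rename_keys` in this file.

```python
import torch


def rename_key(state_dict: dict, old_key: str, new_key: str) -> None:
    """Move a tensor from its original checkpoint name to the Hugging Face name."""
    state_dict[new_key] = state_dict.pop(old_key)


# Illustrative pair only: the source key is real, the target key is a guess at the HF naming.
rename_keys = [
    (
        "backbone.0.patch_embed.proj.weight",
        "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight",
    ),
]

# Stand-in for the checkpoint loaded from groundingdino_swint_ogc.pth.
original_state_dict = {"backbone.0.patch_embed.proj.weight": torch.zeros(96, 3, 4, 4)}

new_state_dict = dict(original_state_dict)
for old_key, new_key in rename_keys:
    rename_key(new_state_dict, old_key, new_key)

# After the q/k/v splitting step, the script calls:
# model.load_state_dict(new_state_dict, strict=False)
```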
diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 251289f7add757..d98892922c0024 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -286,7 +286,7 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndar return masks -# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDINO +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDino def prepare_coco_detection_annotation( image, target, @@ -294,7 +294,7 @@ def prepare_coco_detection_annotation( input_data_format: Optional[Union[ChannelDimension, str]] = None, ): """ - Convert the target in COCO format into the format expected by GroundingDINO. + Convert the target in COCO format into the format expected by GroundingDino. """ image_height, image_width = get_image_size(image, channel_dim=input_data_format) @@ -379,7 +379,7 @@ def masks_to_boxes(masks: np.ndarray) -> np.ndarray: return np.stack([x_min, y_min, x_max, y_max], 1) -# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDINO +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDino def prepare_coco_panoptic_annotation( image: np.ndarray, target: Dict, @@ -388,7 +388,7 @@ def prepare_coco_panoptic_annotation( input_data_format: Union[ChannelDimension, str] = None, ) -> Dict: """ - Prepare a coco panoptic annotation for GroundingDINO. + Prepare a coco panoptic annotation for GroundingDino. """ image_height, image_width = get_image_size(image, channel_dim=input_data_format) annotation_path = pathlib.Path(masks_path) / target["file_name"] @@ -758,7 +758,7 @@ def compute_segments( return segmentation, segments -class GroundingDINOImageProcessor(BaseImageProcessor): +class GroundingDinoImageProcessor(BaseImageProcessor): r""" Constructs a Grounding DINO image processor. @@ -839,11 +839,11 @@ def __init__( self.do_pad = do_pad @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDino def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, + created using from_dict and kwargs e.g. 
`GroundingDinoImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)` """ image_processor_dict = image_processor_dict.copy() @@ -853,7 +853,7 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDINO + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDino def prepare_annotation( self, image: np.ndarray, @@ -864,7 +864,7 @@ def prepare_annotation( input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> Dict: """ - Prepare an annotation for feeding into GroundingDINO model. + Prepare an annotation for feeding into GroundingDino model. """ format = format if format is not None else self.format @@ -1345,16 +1345,16 @@ def preprocess( return encoded_inputs - # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDINO + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino def post_process_object_detection( self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None ): """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. Args: - outputs ([`GroundingDINOObjectDetectionOutput`]): + outputs ([`GroundingDinoObjectDetectionOutput`]): Raw outputs of the model. threshold (`float`, *optional*): Score threshold to keep object detection predictions. diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 5741acbd7e7d2f..664f549603b6e3 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -46,7 +46,7 @@ from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone -from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextConfig +from .configuration_grounding_dino import GroundingDinoConfig, GroundingDinoTextConfig from .load_custom import load_cuda_kernels @@ -71,7 +71,7 @@ logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "GroundingDINOConfig" +_CONFIG_FOR_DOC = "GroundingDinoConfig" _CHECKPOINT_FOR_DOC = "EduardoPacheco/grounding-dino-tiny" GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -130,9 +130,9 @@ def backward(context, grad_output): @dataclass -class GroundingDINODecoderOutput(ModelOutput): +class GroundingDinoDecoderOutput(ModelOutput): """ - Base class for outputs of the GroundingDINODecoder. This class adds two attributes to + Base class for outputs of the GroundingDinoDecoder. This class adds two attributes to BaseModelOutputWithCrossAttentions, namely: - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) - a stacked tensor of intermediate reference points. 
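Since `post_process_object_detection` is touched above, a short usage sketch may help: the model emits normalized `(center_x, center_y, width, height)` boxes, and this method (copied from OwlViT, so it returns a list of dicts with `scores`, `labels` and `boxes`) rescales them to absolute `(x_min, y_min, x_max, y_max)` coordinates. The threshold value and the reuse of `processor`, `model`, `image` and `inputs` from the earlier sketch are illustrative assumptions.

```python
import torch

# `processor`, `model`, `image` and `inputs` as in the earlier processor sketch.
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes is (height, width) per image, so normalized boxes can be mapped back to pixels.
target_sizes = torch.tensor([image.size[::-1]])
results = processor.image_processor.post_process_object_detection(
    outputs, threshold=0.35, target_sizes=target_sizes
)[0]

for score, box in zip(results["scores"], results["boxes"]):
    # box is (x_min, y_min, x_max, y_max) in absolute pixel coordinates
    print(f"score={score.item():.2f}, box={[round(coord, 1) for coord in box.tolist()]}")
```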
@@ -162,9 +162,9 @@ class GroundingDINODecoderOutput(ModelOutput): @dataclass -class GroundingDINOEncoderOutput(ModelOutput): +class GroundingDinoEncoderOutput(ModelOutput): """ - Base class for outputs of the GroundingDINOEncoder. This class extends BaseModelOutput, due to: + Base class for outputs of the GroundingDinoEncoder. This class extends BaseModelOutput, due to: - vision and text last hidden states - vision and text intermediate hidden states @@ -196,7 +196,7 @@ class GroundingDINOEncoderOutput(ModelOutput): @dataclass -class GroundingDINOModelOutput(ModelOutput): +class GroundingDinoModelOutput(ModelOutput): """ Base class for outputs of the Grounding DINO encoder-decoder model. @@ -259,9 +259,9 @@ class GroundingDINOModelOutput(ModelOutput): @dataclass -class GroundingDINOObjectDetectionOutput(ModelOutput): +class GroundingDinoObjectDetectionOutput(ModelOutput): """ - Output type of [`GroundingDINOForObjectDetection`]. + Output type of [`GroundingDinoForObjectDetection`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -275,7 +275,7 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~GroundingDINOProcessor.post_process_object_detection`] to retrieve the + possible padding). You can use [`~GroundingDinoProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`List[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) @@ -353,8 +353,8 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDINO -class GroundingDINOFrozenBatchNorm2d(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino +class GroundingDinoFrozenBatchNorm2d(nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed. @@ -393,10 +393,10 @@ def forward(self, x): return x * scale + bias -# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDINO +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDino def replace_batch_norm(model): r""" - Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDINOFrozenBatchNorm2d`. + Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDinoFrozenBatchNorm2d`. Args: model (torch.nn.Module): @@ -404,7 +404,7 @@ def replace_batch_norm(model): """ for name, module in model.named_children(): if isinstance(module, nn.BatchNorm2d): - new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) + new_module = GroundingDinoFrozenBatchNorm2d(module.num_features) if not module.weight.device == torch.device("meta"): new_module.weight.data.copy_(module.weight) @@ -418,11 +418,11 @@ def replace_batch_norm(model): replace_batch_norm(module) -class GroundingDINOConvEncoder(nn.Module): +class GroundingDinoConvEncoder(nn.Module): """ Convolutional backbone using the AutoBackbone API. - nn.BatchNorm2d layers are replaced by GroundingDINOFrozenBatchNorm2d as defined above. 
+ nn.BatchNorm2d layers are replaced by GroundingDinoFrozenBatchNorm2d as defined above. """ def __init__(self, config): @@ -455,8 +455,8 @@ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): return out -# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDINO -class GroundingDINOConvModel(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDino +class GroundingDinoConvModel(nn.Module): """ This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. """ @@ -477,7 +477,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos -class GroundingDINOSinePositionEmbedding(nn.Module): +class GroundingDinoSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. @@ -516,7 +516,7 @@ def forward(self, pixel_values, pixel_mask): # Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding -class GroundingDINOLearnedPositionEmbedding(nn.Module): +class GroundingDinoLearnedPositionEmbedding(nn.Module): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -543,11 +543,11 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding( + position_embedding = GroundingDinoSinePositionEmbedding( n_steps, config.positional_embedding_temperature, normalize=True ) elif config.position_embedding_type == "learned": - position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) + position_embedding = GroundingDinoLearnedPositionEmbedding(n_steps) else: raise ValueError(f"Not supported {config.position_embedding_type}") @@ -594,13 +594,13 @@ def multi_scale_deformable_attention( return output.transpose(1, 2).contiguous() -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO -class GroundingDINOMultiscaleDeformableAttention(nn.Module): +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDino,Deformable DETR->Grounding DINO +class GroundingDinoMultiscaleDeformableAttention(nn.Module): """ Multiscale deformable attention as proposed in Deformable DETR. """ - def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): + def __init__(self, config: GroundingDinoConfig, num_heads: int, n_points: int): super().__init__() if config.d_model % num_heads != 0: raise ValueError( @@ -610,7 +610,7 @@ def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): # check if dim_per_head is power of 2 if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): warnings.warn( - "You'd better set embed_dim (d_model) in GroundingDINOMultiscaleDeformableAttention to make the" + "You'd better set embed_dim (d_model) in GroundingDinoMultiscaleDeformableAttention to make the" " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" " implementation." 
) @@ -728,7 +728,7 @@ def forward( return output, attention_weights -class GroundingDINOTextEnhancerLayer(nn.Module): +class GroundingDinoTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" def __init__(self, config): @@ -760,7 +760,7 @@ def forward( position_embeddings: Optional[torch.FloatTensor] = None, ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: """Text self-attention to enhance projection of text features generated by - the text encoder (GroundingDINOTextPrenet) within GroundingDINOEncoderLayer + the text encoder (GroundingDinoTextPrenet) within GroundingDinoEncoderLayer Args: hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`): @@ -803,7 +803,7 @@ def forward( return hidden_states, attention_weights -class GroundingDINOBiMultiHeadAttention(nn.Module): +class GroundingDinoBiMultiHeadAttention(nn.Module): def __init__(self, config): super().__init__() @@ -975,8 +975,8 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals return output -# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO -class GroundingDINODropPath(nn.Module): +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDino +class GroundingDinoDropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" def __init__(self, drop_prob: Optional[float] = None) -> None: @@ -990,7 +990,7 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -class GroundingDINOFusionLayer(nn.Module): +class GroundingDinoFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() drop_path = config.fusion_droppath @@ -998,10 +998,10 @@ def __init__(self, config, init_values=1e-4): # pre layer norm self.layer_norm_vision = nn.LayerNorm(config.d_model) self.layer_norm_text = nn.LayerNorm(config.d_model) - self.attn = GroundingDINOBiMultiHeadAttention(config) + self.attn = GroundingDinoBiMultiHeadAttention(config) # add layer scale for training stability - self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path = GroundingDinoDropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.vision_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) self.text_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) @@ -1053,11 +1053,11 @@ def forward( # NOTE just renamed the class -class GroundingDINODeformableLayer(nn.Module): - def __init__(self, config: GroundingDINOConfig): +class GroundingDinoDeformableLayer(nn.Module): + def __init__(self, config: GroundingDinoConfig): super().__init__() self.embed_dim = config.d_model - self.self_attn = GroundingDINOMultiscaleDeformableAttention( + self.self_attn = GroundingDinoMultiscaleDeformableAttention( config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points ) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -1165,15 +1165,15 @@ def sine_func(x: torch.Tensor): return pos_res -class GroundingDINOEncoderLayer(nn.Module): +class GroundingDinoEncoderLayer(nn.Module): def __init__(self, config) -> None: super().__init__() self.d_model = config.d_model - self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) - self.fusion_layer = GroundingDINOFusionLayer(config) - self.deformable_layer = GroundingDINODeformableLayer(config) + self.text_enhancer_layer = 
GroundingDinoTextEnhancerLayer(config) + self.fusion_layer = GroundingDinoFusionLayer(config) + self.deformable_layer = GroundingDinoDeformableLayer(config) def get_text_position_embeddings( self, text_features: Tensor, text_position_embedding: Tensor, text_position_ids: Tensor @@ -1240,8 +1240,8 @@ def forward( ) -class GroundingDINODecoderLayer(nn.Module): - def __init__(self, config: GroundingDINOConfig): +class GroundingDinoDecoderLayer(nn.Module): + def __init__(self, config: GroundingDinoConfig): super().__init__() self.embed_dim = config.d_model @@ -1266,7 +1266,7 @@ def __init__(self, config: GroundingDINOConfig): ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention - self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( + self.encoder_attn = GroundingDinoMultiscaleDeformableAttention( config, num_heads=config.decoder_attention_heads, n_points=config.decoder_n_points, @@ -1358,7 +1358,7 @@ def forward( return outputs -class GroundingDINOContrastiveEmbedding(nn.Module): +class GroundingDinoContrastiveEmbedding(nn.Module): def __init__(self, config): super().__init__() self.max_text_len = config.max_text_len @@ -1380,7 +1380,7 @@ def forward( # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead -class GroundingDINOClassificationHead(nn.Module): +class GroundingDinoClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): @@ -1398,20 +1398,20 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -class GroundingDINOPreTrainedModel(PreTrainedModel): - config_class = GroundingDINOConfig +class GroundingDinoPreTrainedModel(PreTrainedModel): + config_class = GroundingDinoConfig base_model_prefix = "model" main_input_name = "pixel_values" def _init_weights(self, module): std = self.config.init_std - if isinstance(module, GroundingDINOLearnedPositionEmbedding): + if isinstance(module, GroundingDinoLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) - elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): + elif isinstance(module, GroundingDinoMultiscaleDeformableAttention): module._reset_parameters() - elif isinstance(module, GroundingDINOBiMultiHeadAttention): + elif isinstance(module, GroundingDinoBiMultiHeadAttention): nn.init.xavier_uniform_(module.vision_proj.weight) module.vision_proj.bias.data.fill_(0) nn.init.xavier_uniform_(module.text_proj.weight) @@ -1424,7 +1424,7 @@ def _init_weights(self, module): module.out_vision_proj.bias.data.fill_(0) nn.init.xavier_uniform_(module.out_text_proj.weight) module.out_text_proj.bias.data.fill_(0) - elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): + elif isinstance(module, (GroundingDinoEncoderLayer, GroundingDinoDecoderLayer)): for p in module.parameters(): if p.dim() > 1: nn.init.normal_(p, mean=0.0, std=std) @@ -1438,7 +1438,7 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - elif isinstance(module, GroundingDINOMLPPredictionHead): + elif isinstance(module, GroundingDinoMLPPredictionHead): nn.init.constant_(module.layers[-1].weight.data, 0) nn.init.constant_(module.layers[-1].bias.data, 0) @@ -1449,7 +1449,7 @@ def _init_weights(self, module): nn.init.normal_(module.level_embed) def 
_set_gradient_checkpointing(self, module, value=False): - if isinstance(module, GroundingDINODecoder): + if isinstance(module, GroundingDinoDecoder): module.gradient_checkpointing = value @@ -1463,7 +1463,7 @@ def _set_gradient_checkpointing(self, module, value=False): and behavior. Parameters: - config ([`GroundingDINOConfig`]): + config ([`GroundingDinoConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. @@ -1474,7 +1474,7 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDINOImageProcessor.__call__`] for + Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDinoImageProcessor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): @@ -1489,7 +1489,7 @@ def _set_gradient_checkpointing(self, module, value=False): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. - Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDINOTokenizer.__call__`] for details. + Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDinoTokenizer.__call__`] for details. attention_mask (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: @@ -1522,22 +1522,22 @@ def _set_gradient_checkpointing(self, module, value=False): """ -class GroundingDINOEncoder(GroundingDINOPreTrainedModel): +class GroundingDinoEncoder(GroundingDinoPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a - [`GroundingDINOEncoderLayer`]. + [`GroundingDinoEncoderLayer`]. The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. Args: - config: GroundingDINOConfig + config: GroundingDinoConfig """ - def __init__(self, config: GroundingDINOConfig): + def __init__(self, config: GroundingDinoConfig): super().__init__(config) self.dropout = config.dropout - self.layers = nn.ModuleList([GroundingDINOEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layers = nn.ModuleList([GroundingDinoEncoderLayer(config) for _ in range(config.encoder_layers)]) # Initialize weights and apply final processing self.post_init() @@ -1681,7 +1681,7 @@ def forward( if not return_dict: enc_outputs = [vision_features, text_features, encoder_vision_states, encoder_text_states, all_attns] return tuple(v for v in enc_outputs if v is not None) - return GroundingDINOEncoderOutput( + return GroundingDinoEncoderOutput( last_hidden_state_vision=vision_features, last_hidden_state_text=text_features, hidden_states_vision=encoder_vision_states, @@ -1690,9 +1690,9 @@ def forward( ) -class GroundingDINODecoder(GroundingDINOPreTrainedModel): +class GroundingDinoDecoder(GroundingDinoPreTrainedModel): """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a [`GroundingDinoDecoderLayer`]. The decoder updates the query embeddings through multiple self-attention and cross-attention layers. @@ -1702,16 +1702,16 @@ class GroundingDINODecoder(GroundingDINOPreTrainedModel): - it also returns a stack of intermediate outputs and reference points from all decoding layers. Args: - config: GroundingDINOConfig + config: GroundingDinoConfig """ - def __init__(self, config: GroundingDINOConfig): + def __init__(self, config: GroundingDinoConfig): super().__init__(config) self.dropout = config.dropout self.layer_norm = nn.LayerNorm(config.d_model) - self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) - self.reference_points_head = GroundingDINOMLPPredictionHead( + self.layers = nn.ModuleList([GroundingDinoDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.reference_points_head = GroundingDinoMLPPredictionHead( config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2 ) self.gradient_checkpointing = False @@ -1941,7 +1941,7 @@ def custom_forward(*inputs): ] if v is not None ) - return GroundingDINODecoderOutput( + return GroundingDinoDecoderOutput( last_hidden_state=hidden_states, intermediate_hidden_states=intermediate, intermediate_reference_points=intermediate_reference_points, @@ -1999,14 +1999,14 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen """, GROUNDING_DINO_START_DOCSTRING, ) -class GroundingDINOModel(GroundingDINOPreTrainedModel): - def __init__(self, config: GroundingDINOConfig): +class GroundingDinoModel(GroundingDinoPreTrainedModel): + def __init__(self, config: GroundingDinoConfig): super().__init__(config) # Create backbone + positional encoding - backbone = GroundingDINOConvEncoder(config) + backbone = GroundingDinoConvEncoder(config) position_embeddings = build_position_encoding(config) - self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + self.backbone = GroundingDinoConvModel(backbone, position_embeddings) # Create input projection layers if config.num_feature_levels > 1: @@ -2040,14 +2040,14 @@ def __init__(self, config: GroundingDINOConfig): ) # Create text backbone - self.text_backbone = GroundingDINOTextPrenet(config.text_backbone_config) + self.text_backbone = GroundingDinoTextPrenet(config.text_backbone_config) self.text_projection = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) - self.encoder = GroundingDINOEncoder(config) - self.decoder = GroundingDINODecoder(config) + self.encoder = GroundingDinoEncoder(config) + self.decoder = GroundingDinoDecoder(config) self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) @@ -2061,11 +2061,11 @@ def __init__(self, config: GroundingDINOConfig): ): self.encoder_output_bbox_embed = self.decoder.bbox_embed else: - self.encoder_output_bbox_embed = GroundingDINOMLPPredictionHead( + self.encoder_output_bbox_embed = GroundingDinoMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - self.encoder_output_class_embed = GroundingDINOContrastiveEmbedding(config) + self.encoder_output_class_embed = GroundingDinoContrastiveEmbedding(config) else: self.reference_points = nn.Embedding(config.num_queries, 4) @@ -2166,7 +2166,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) return 
object_query, output_proposals @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=GroundingDinoModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: Tensor, @@ -2185,7 +2185,7 @@ def forward( Examples: ```python - >>> from transformers import AutoProcessor, GroundingDINOModel + >>> from transformers import AutoProcessor, GroundingDinoModel >>> from PIL import Image >>> import requests @@ -2194,7 +2194,7 @@ def forward( >>> text = "a cat." >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") - >>> model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) @@ -2315,9 +2315,9 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): - encoder_outputs = GroundingDINOEncoderOutput( + # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDinoEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, GroundingDinoEncoderOutput): + encoder_outputs = GroundingDinoEncoderOutput( last_hidden_state_vision=encoder_outputs[0], last_hidden_state_text=encoder_outputs[1], hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, @@ -2387,7 +2387,7 @@ def forward( return tuple_outputs - return GroundingDINOModelOutput( + return GroundingDinoModelOutput( init_reference_points=init_reference_points, last_hidden_state=decoder_outputs.last_hidden_state, intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, @@ -2411,19 +2411,19 @@ def forward( """, GROUNDING_DINO_START_DOCSTRING, ) -class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): +class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required _tied_weights_keys = [r"bbox_embed\.[1-9]\d*"] - def __init__(self, config: GroundingDINOConfig): + def __init__(self, config: GroundingDinoConfig): super().__init__(config) # Deformable DETR encoder-decoder model - self.model = GroundingDINOModel(config) + self.model = GroundingDinoModel(config) # Detection heads on top - _class_embed = GroundingDINOContrastiveEmbedding(config) - _bbox_embed = GroundingDINOMLPPredictionHead( + _class_embed = GroundingDinoContrastiveEmbedding(config) + _bbox_embed = GroundingDinoMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) @@ -2448,7 +2448,7 @@ def _set_aux_loss(self, outputs_class, outputs_coord): return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=GroundingDinoObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -2456,7 +2456,7 @@ def forward( attention_mask: 
torch.LongTensor = None, token_type_ids: torch.LongTensor = None, pixel_mask: Optional[torch.BoolTensor] = None, - encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, + encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, Tuple]] = None, labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -2474,7 +2474,7 @@ def forward( Examples: ```python - >>> from transformers import AutoProcessor, GroundingDINOForObjectDetection + >>> from transformers import AutoProcessor, GroundingDinoForObjectDetection >>> from PIL import Image >>> import requests @@ -2483,7 +2483,7 @@ def forward( >>> text = "a cat." >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") - >>> model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) @@ -2560,12 +2560,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = GroundingDINOHungarianMatcher( + matcher = GroundingDinoHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality"] - criterion = GroundingDINOLoss( + criterion = GroundingDinoLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -2603,7 +2603,7 @@ def forward( return tuple_outputs - dict_outputs = GroundingDINOObjectDetectionOutput( + dict_outputs = GroundingDinoObjectDetectionOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -2679,15 +2679,15 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f return loss.mean(1).sum() / num_boxes -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDINO -class GroundingDINOLoss(nn.Module): +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDino +class GroundingDinoLoss(nn.Module): """ - This class computes the losses for `GroundingDINOForObjectDetection`. The process happens in two steps: 1) we + This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box). Args: - matcher (`GroundingDINOHungarianMatcher`): + matcher (`GroundingDinoHungarianMatcher`): Module able to compute a matching between targets and proposals. num_classes (`int`): Number of object categories, omitting the special no-object category. @@ -2858,7 +2858,7 @@ def forward(self, outputs, targets): # Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead -class GroundingDINOMLPPredictionHead(nn.Module): +class GroundingDinoMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, height and width of a bounding box w.r.t. an image. 
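# The MLP head described in the docstring above is small enough to sketch in full. This is an
# illustrative, self-contained version (the names `MLPPredictionHeadSketch` and `head` and the
# tensor sizes below are examples, not the library API): num_layers - 1 hidden Linear+ReLU layers
# followed by a final Linear layer whose 4 outputs are typically passed through a sigmoid by the
# caller to obtain normalized (center_x, center_y, width, height) boxes.
import torch
from torch import nn


class MLPPredictionHeadSketch(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int):
        super().__init__()
        dims = [input_dim] + [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(dims, dims[1:] + [output_dim]))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for i, layer in enumerate(self.layers):
            hidden_states = layer(hidden_states)
            if i < len(self.layers) - 1:
                hidden_states = torch.relu(hidden_states)
        return hidden_states


# Usage mirroring the detection head above: d_model-sized decoder states mapped to 4 box values.
head = MLPPredictionHeadSketch(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
boxes = head(torch.randn(2, 900, 256)).sigmoid()  # (batch_size, num_queries, 4), values in [0, 1]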
@@ -2879,8 +2879,8 @@ def forward(self, x): return x -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDINO -class GroundingDINOHungarianMatcher(nn.Module): +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino +class GroundingDinoHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. @@ -3078,8 +3078,8 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): return NestedTensor(tensor, mask) -# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText -class GroundingDINOTextEmbeddings(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDinoText +class GroundingDinoTextEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config): @@ -3143,8 +3143,8 @@ def forward( return embeddings -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText -class GroundingDINOTextSelfAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDinoText +class GroundingDinoTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -3251,7 +3251,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in GroundingDINOTextModel forward() function) + # Apply the attention mask is (precomputed for all layers in GroundingDinoTextModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
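# The text self-attention above follows the standard BERT recipe: scaled dot-product scores, an
# additive mask (0 for visible tokens, a large negative value for padding), then a softmax. A
# minimal numeric sketch of just that score computation; all shapes and values are illustrative.
import math

import torch

batch_size, num_heads, seq_length, head_size = 2, 4, 6, 8
query = torch.randn(batch_size, num_heads, seq_length, head_size)
key = torch.randn(batch_size, num_heads, seq_length, head_size)
value = torch.randn(batch_size, num_heads, seq_length, head_size)
# Additive mask broadcast over heads and query positions; mask out the last two key positions.
attention_mask = torch.zeros(batch_size, 1, 1, seq_length)
attention_mask[..., -2:] = torch.finfo(torch.float32).min

attention_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_size)
attention_scores = attention_scores + attention_mask
attention_probs = torch.softmax(attention_scores, dim=-1)  # rows sum to 1, masked keys get ~0 weight
context = torch.matmul(attention_probs, value)              # (batch_size, num_heads, seq_length, head_size)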
@@ -3278,8 +3278,8 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText -class GroundingDINOTextSelfOutput(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDinoText +class GroundingDinoTextSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -3293,12 +3293,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText -class GroundingDINOTextAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDinoText +class GroundingDinoTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = GroundingDINOTextSelfAttention(config, position_embedding_type=position_embedding_type) - self.output = GroundingDINOTextSelfOutput(config) + self.self = GroundingDinoTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = GroundingDinoTextSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): @@ -3343,8 +3343,8 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText -class GroundingDINOTextIntermediate(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDinoText +class GroundingDinoTextIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) @@ -3359,8 +3359,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText -class GroundingDINOTextOutput(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDinoText +class GroundingDinoTextOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -3374,21 +3374,21 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText -class GroundingDINOTextLayer(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDinoText +class GroundingDinoTextLayer(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = GroundingDINOTextAttention(config) + self.attention = GroundingDinoTextAttention(config) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = GroundingDINOTextAttention(config, position_embedding_type="absolute") - self.intermediate = GroundingDINOTextIntermediate(config) - self.output = GroundingDINOTextOutput(config) + self.crossattention = GroundingDinoTextAttention(config, position_embedding_type="absolute") + self.intermediate = GroundingDinoTextIntermediate(config) + self.output = GroundingDinoTextOutput(config) def forward( 
self, @@ -3461,12 +3461,12 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText -class GroundingDINOTextEncoder(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDinoText +class GroundingDinoTextEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([GroundingDINOTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([GroundingDinoTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -3555,8 +3555,8 @@ def forward( ) -# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText -class GroundingDINOTextPooler(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDinoText +class GroundingDinoTextPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -3571,17 +3571,17 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output -class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): - config_class = GroundingDINOTextConfig +class GroundingDinoTextPrenet(GroundingDinoPreTrainedModel): + config_class = GroundingDinoTextConfig def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config - self.embeddings = GroundingDINOTextEmbeddings(config) - self.encoder = GroundingDINOTextEncoder(config) + self.embeddings = GroundingDinoTextEmbeddings(config) + self.encoder = GroundingDinoTextEncoder(config) - self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + self.pooler = GroundingDinoTextPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index ac3d44eaa758b4..20265e98c15c09 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -51,13 +51,13 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids -class GroundingDINOProcessor(ProcessorMixin): +class GroundingDinoProcessor(ProcessorMixin): r""" Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a single processor. - [`GroundingDINOProcessor`] offers all the functionalities of [`DeformableDetrImageProcessor`] and - [`AutoTokenizer`]. See the docstring of [`~GroundingDINOProcessor.__call__`] and [`~GroundingDINOProcessor.decode`] + [`GroundingDinoProcessor`] offers all the functionalities of [`DeformableDetrImageProcessor`] and + [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`] for more information. 
Args: @@ -68,7 +68,7 @@ class GroundingDINOProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "GroundingDINOImageProcessor" + image_processor_class = "GroundingDinoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 686f5d7d1a11d7..9a28b7b98548e9 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4043,21 +4043,21 @@ def __init__(self, *args, **kwargs): GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None -class GroundingDINOForObjectDetection(metaclass=DummyObject): +class GroundingDinoForObjectDetection(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GroundingDINOModel(metaclass=DummyObject): +class GroundingDinoModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GroundingDINOPreTrainedModel(metaclass=DummyObject): +class GroundingDinoPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 26c7a08f7b064c..352d88cf65ce44 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -247,7 +247,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class GroundingDINOImageProcessor(metaclass=DummyObject): +class GroundingDinoImageProcessor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 3c24b9cedd4340..51bd5807991458 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -30,11 +30,11 @@ if is_vision_available(): from PIL import Image - from transformers import GroundingDINOImageProcessor + from transformers import GroundingDinoImageProcessor -# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester with DeformableDetr->GroundingDINO -class GroundingDINOImageProcessingTester(unittest.TestCase): +# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester with DeformableDetr->GroundingDino +class GroundingDinoImageProcessingTester(unittest.TestCase): def __init__( self, parent, @@ -81,7 +81,7 @@ def prepare_image_processor_dict(self): def get_expected_values(self, image_inputs, batched=False): """ - This function computes the expected height and width when providing images to GroundingDINOImageProcessor, + This function computes the expected height and width when providing images to GroundingDinoImageProcessor, assuming do_resize is set to True with a scalar size. 
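Concretely, the shorter edge of the input image is scaled to `size["shortest_edge"]` and the other
edge is scaled by the same factor, so the expected output preserves the aspect ratio (the image
processor itself may additionally cap the longer edge at `size["longest_edge"]`).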
""" if not batched: @@ -128,12 +128,12 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_torch @require_vision -# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDINO -class GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = GroundingDINOImageProcessor if is_vision_available() else None +# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDino +class GroundingDinoImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None def setUp(self): - self.image_processor_tester = GroundingDINOImageProcessingTester(self) + self.image_processor_tester = GroundingDinoImageProcessingTester(self) @property def image_processor_dict(self): @@ -170,7 +170,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - image_processing = GroundingDINOImageProcessor() + image_processing = GroundingDinoImageProcessor() encoding = image_processing(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -216,7 +216,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") # encode them - image_processing = GroundingDINOImageProcessor(format="coco_panoptic") + image_processing = GroundingDinoImageProcessor(format="coco_panoptic") encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") # verify pixel values diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index f8fc49fd3754ea..fc41dfb3a2349c 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -22,8 +22,8 @@ from typing import Dict, List, Tuple from transformers import ( - GroundingDINOConfig, - GroundingDINOTextConfig, + GroundingDinoConfig, + GroundingDinoTextConfig, SwinConfig, is_torch_available, is_vision_available, @@ -47,7 +47,7 @@ if is_torch_available(): import torch - from transformers import GroundingDINOForObjectDetection, GroundingDINOModel + from transformers import GroundingDinoForObjectDetection, GroundingDinoModel from transformers.pytorch_utils import id_tensor_storage @@ -57,7 +57,7 @@ from transformers import AutoProcessor -class GroundingDINOModelTester: +class GroundingDinoModelTester: def __init__( self, parent, @@ -146,10 +146,10 @@ def get_config(self): out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) - text_backbone = GroundingDINOTextConfig( + text_backbone = GroundingDinoTextConfig( hidden_size=8, num_hidden_layers=2, num_attention_heads=2, intermediate_size=8, max_position_embeddings=8 ) - return GroundingDINOConfig( + return GroundingDinoConfig( d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, @@ -176,7 +176,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, labels): - model = GroundingDINOModel(config=config) + model = GroundingDinoModel(config=config) model.to(torch_device) 
model.eval() @@ -185,7 +185,7 @@ def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, la self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) def create_and_check_object_detection_head_model(self, config, pixel_values, pixel_mask, input_ids, labels): - model = GroundingDINOForObjectDetection(config=config) + model = GroundingDinoForObjectDetection(config=config) model.to(torch_device) model.eval() @@ -202,15 +202,15 @@ def create_and_check_object_detection_head_model(self, config, pixel_values, pix @require_torch -class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (GroundingDINOModel, GroundingDINOForObjectDetection) if is_torch_available() else () +class GroundingDinoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GroundingDinoModel, GroundingDinoForObjectDetection) if is_torch_available() else () is_encoder_decoder = True test_torchscript = False test_pruning = False test_head_masking = False test_missing_keys = False pipeline_model_mapping = ( - {"feature-extraction": GroundingDINOModel, "object-detection": GroundingDINOForObjectDetection} + {"feature-extraction": GroundingDinoModel, "object-detection": GroundingDinoForObjectDetection} if is_torch_available() else {} ) @@ -220,7 +220,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class.__name__ == "GroundingDINOForObjectDetection": + if model_class.__name__ == "GroundingDinoForObjectDetection": labels = [] for i in range(self.model_tester.batch_size): target = {} @@ -243,8 +243,8 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict def setUp(self): - self.model_tester = GroundingDINOModelTester(self) - self.config_tester = ConfigTester(self, config_class=GroundingDINOConfig, has_text_modality=False) + self.model_tester = GroundingDinoModelTester(self) + self.config_tester = ConfigTester(self, config_class=GroundingDinoConfig, has_text_modality=False) def test_config(self): # we don't test common_properties and arguments_init as these don't apply for Grounding DINO @@ -325,7 +325,7 @@ def test_attention_outputs(self): if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "GroundingDINOForObjectDetection": + if model_class.__name__ == "GroundingDinoForObjectDetection": correct_outlen += 2 self.assertEqual(out_len, correct_outlen) @@ -580,7 +580,7 @@ def test_different_timm_backbone(self): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - if model_class.__name__ == "GroundingDINOForObjectDetection": + if model_class.__name__ == "GroundingDinoForObjectDetection": expected_shape = ( self.model_tester.batch_size, self.model_tester.num_queries, @@ -617,7 +617,7 @@ def test_initialization(self): ) def test_two_stage_training(self): - model_class = GroundingDINOForObjectDetection + model_class = GroundingDinoForObjectDetection config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True config.two_stage = True @@ -655,9 +655,9 @@ def test_tied_weights_keys(self): for i in range(len(tied_params)): tied_params[i] = [p for 
p in tied_params[i] if re.search(key, p) is None] - # GroundingDINO when sharing weights also uses the shared ones in GroundingDINODecoder + # GroundingDino when sharing weights also uses the shared ones in GroundingDinoDecoder # Therefore, differently from DeformableDetr, we expect the group lens to be 2 - # one for self.bbox_embed in GroundingDINOForObejectDetection and another one + # one for self.bbox_embed in GroundingDinoForObejectDetection and another one # in the decoder tied_params = [group for group in tied_params if len(group) > 2] self.assertListEqual( @@ -684,13 +684,13 @@ def prepare_text(): @require_timm @require_vision @slow -class GroundingDINOModelIntegrationTests(unittest.TestCase): +class GroundingDinoModelIntegrationTests(unittest.TestCase): @cached_property def default_processor(self): return AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") if is_vision_available() else None def test_inference_object_detection_head(self): - model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").to(torch_device) + model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").to(torch_device) processor = self.default_processor image = prepare_img() @@ -749,7 +749,7 @@ def test_inference_object_detection_head_equivalence_cpu_gpu(self): encoding = processor(images=image, text=text, return_tensors="pt") # 1. run model on CPU - model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") with torch.no_grad(): cpu_outputs = model(**encoding) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 13a133c80cc861..44283bc69737e6 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -21,7 +21,7 @@ import numpy as np import pytest -from transformers import BertTokenizer, BertTokenizerFast, GroundingDINOProcessor +from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available @@ -30,11 +30,11 @@ if is_vision_available(): from PIL import Image - from transformers import GroundingDINOImageProcessor + from transformers import GroundingDinoImageProcessor @require_vision -class GroundingDINOProcessorTest(unittest.TestCase): +class GroundingDinoProcessorTest(unittest.TestCase): def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -65,9 +65,9 @@ def get_tokenizer(self, **kwargs): def get_rust_tokenizer(self, **kwargs): return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_image_processor with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_image_processor with CLIP->GroundingDino def get_image_processor(self, **kwargs): - return GroundingDINOImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + return GroundingDinoImageProcessor.from_pretrained(self.tmpdirname, **kwargs) # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.tearDown def tearDown(self): @@ -85,19 +85,19 @@ def prepare_image_inputs(self): return image_inputs - # Copied from 
tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_default with CLIP->GroundingDINO,GroundingDINOTokenizer->BertTokenizer + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_default with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() - processor_slow = GroundingDINOProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow = GroundingDinoProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) processor_slow.save_pretrained(self.tmpdirname) - processor_slow = GroundingDINOProcessor.from_pretrained(self.tmpdirname, use_fast=False) + processor_slow = GroundingDinoProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = GroundingDINOProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = GroundingDinoProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) processor_fast.save_pretrained(self.tmpdirname) - processor_fast = GroundingDINOProcessor.from_pretrained(self.tmpdirname) + processor_fast = GroundingDinoProcessor.from_pretrained(self.tmpdirname) self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) @@ -107,18 +107,18 @@ def test_save_load_pretrained_default(self): self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, GroundingDINOImageProcessor) - self.assertIsInstance(processor_fast.image_processor, GroundingDINOImageProcessor) + self.assertIsInstance(processor_slow.image_processor, GroundingDinoImageProcessor) + self.assertIsInstance(processor_fast.image_processor, GroundingDinoImageProcessor) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDINO,GroundingDINOTokenizer->BertTokenizer + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer def test_save_load_pretrained_additional_features(self): - processor = GroundingDINOProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor = GroundingDinoProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - processor = GroundingDINOProcessor.from_pretrained( + processor = GroundingDinoProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 ) @@ -126,14 +126,14 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, GroundingDINOImageProcessor) + self.assertIsInstance(processor.image_processor, 
GroundingDinoImageProcessor) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDino def test_image_processor(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) image_input = self.prepare_image_inputs() @@ -143,12 +143,12 @@ def test_image_processor(self): for key in input_image_proc.keys(): self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDino def test_tokenizer(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) input_str = "lower newer" @@ -163,7 +163,7 @@ def test_processor(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -178,12 +178,12 @@ def test_processor(self): with pytest.raises(ValueError): processor() - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer_decode with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer_decode with CLIP->GroundingDino def test_tokenizer_decode(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] @@ -192,12 +192,12 @@ def test_tokenizer_decode(self): self.assertListEqual(decoded_tok, decoded_processor) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_model_input_names with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_model_input_names with CLIP->GroundingDino def test_model_input_names(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) input_str = "lower newer" image_input = self.prepare_image_inputs() diff --git a/utils/check_repo.py b/utils/check_repo.py index 5ecf0aa9a7bf07..798d89e8ca6890 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -232,7 +232,7 @@ "FlavaMultimodalModel", "GPT2DoubleHeadsModel", "GPTSw3DoubleHeadsModel", - "GroundingDINOTextPrenet", + "GroundingDinoTextPrenet", "InstructBlipVisionModel", "InstructBlipQFormerModel", "LayoutLMForQuestionAnswering", From 559de31855fc0b78d1cc8d5b8a47cd5d65349b16 Mon Sep 17 00:00:00 2001 From: 
EduardoPach Date: Fri, 8 Dec 2023 14:35:57 -0300 Subject: [PATCH 155/252] Fixed GroundingDinoProcessor docstrings --- .../models/grounding_dino/processing_grounding_dino.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 20265e98c15c09..0e658a42f77baa 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -56,13 +56,13 @@ class GroundingDinoProcessor(ProcessorMixin): Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a single processor. - [`GroundingDinoProcessor`] offers all the functionalities of [`DeformableDetrImageProcessor`] and + [`GroundingDinoProcessor`] offers all the functionalities of [`GroundingDinoImageProcessor`] and [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`] for more information. Args: - image_processor (`DeformableDetrImageProcessor`): - An instance of [`DeformableDetrImageProcessor`]. The image processor is a required input. + image_processor (`GroundingDinoImageProcessor`): + An instance of [`GroundingDinoImageProcessor`]. The image processor is a required input. tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. """ @@ -95,7 +95,7 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method uses [`DeformableDetrImageProcessor.__call__`] method to prepare image(s) for the model, and + This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. Please refer to the docstring of the above two methods for more information. From fef983e21c1a28bdecbf3000cf2fe222f32152c4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 14:47:03 -0300 Subject: [PATCH 156/252] loading MSDA kernels in the modeling file --- .../models/grounding_dino/load_custom.py | 49 ------------------- .../grounding_dino/modeling_grounding_dino.py | 37 +++++++++++++- 2 files changed, 35 insertions(+), 51 deletions(-) delete mode 100644 src/transformers/models/grounding_dino/load_custom.py diff --git a/src/transformers/models/grounding_dino/load_custom.py b/src/transformers/models/grounding_dino/load_custom.py deleted file mode 100644 index 97b8f09fb5f446..00000000000000 --- a/src/transformers/models/grounding_dino/load_custom.py +++ /dev/null @@ -1,49 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Loading of Grounding DINO's CUDA kernels""" -import os -from pathlib import Path - - -def load_cuda_kernels(): - from torch.utils.cpp_extension import load - - root = Path(__file__).resolve().parent.parent.parent / "kernels" / "grounding_dino" - src_files = [ - root / filename - for filename in [ - "vision.cpp", - os.path.join("cpu", "ms_deform_attn_cpu.cpp"), - os.path.join("cuda", "ms_deform_attn_cuda.cu"), - ] - ] - - load( - "MultiScaleDeformableAttention", - src_files, - with_cuda=True, - extra_include_paths=[str(root)], - extra_cflags=["-DWITH_CUDA=1"], - extra_cuda_cflags=[ - "-DCUDA_HAS_FP16=1", - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - ], - ) - - import MultiScaleDeformableAttention as MSDA - - return MSDA diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 664f549603b6e3..0e2e6a4baf23cd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -14,11 +14,12 @@ # limitations under the License. """ PyTorch Grounding DINO model.""" - import copy import math +import os import warnings from dataclasses import dataclass +from pathlib import Path from typing import Dict, List, Optional, Tuple, Union import torch @@ -47,11 +48,43 @@ from ...utils import is_ninja_available, logging from ..auto import AutoBackbone from .configuration_grounding_dino import GroundingDinoConfig, GroundingDinoTextConfig -from .load_custom import load_cuda_kernels logger = logging.get_logger(__name__) + +def load_cuda_kernels(): + from torch.utils.cpp_extension import load + + root = Path(__file__).resolve().parent.parent.parent / "kernels" / "grounding_dino" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] + + load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) + + import MultiScaleDeformableAttention as MSDA + + return MSDA + + # Move this to not compile only when importing, this needs to happen later, like in __init__. 
if is_torch_cuda_available() and is_ninja_available(): logger.info("Loading custom CUDA kernels...") From 9994ee0162eeb3a38bb520dfca03991e98a5a62a Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 11 Dec 2023 11:09:24 +0100 Subject: [PATCH 157/252] Fix copies --- .../models/grounding_dino/image_processing_grounding_dino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index d98892922c0024..4565b744b0a774 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -331,10 +331,13 @@ def prepare_coco_detection_annotation( if annotations and "keypoints" in annotations[0]: keypoints = [obj["keypoints"] for obj in annotations] + # Converting the filtered keypoints list to a numpy array keypoints = np.asarray(keypoints, dtype=np.float32) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] num_keypoints = keypoints.shape[0] keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints - new_target["keypoints"] = keypoints[keep] + new_target["keypoints"] = keypoints if return_segmentation_masks: segmentation_masks = [obj["segmentation"] for obj in annotations] From 14c839dfc8de22b02e29218a4e616a8403af09b6 Mon Sep 17 00:00:00 2001 From: Niels Date: Wed, 31 Jan 2024 22:35:36 +0100 Subject: [PATCH 158/252] Replace nn.multiheadattention --- .../convert_grounding_dino_to_hf.py | 39 ++++++- .../grounding_dino/modeling_grounding_dino.py | 110 ++++++++++++++++-- 2 files changed, 138 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 066e0a209a0f53..5b7290bdfd3184 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -280,7 +280,7 @@ def rename_key(dct, old, new): # we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): +def read_in_q_k_v_encoder(state_dict, config): ########################################## VISION BACKBONE - START embed_dim = config.backbone_config.embed_dim for layer, depth in enumerate(config.backbone_config.depths): @@ -313,6 +313,25 @@ def read_in_q_k_v(state_dict, config): ########################################## VISION BACKBONE - END +def read_in_q_k_v_decoder(state_dict, config): + hidden_size = config.hidden_size + for idx in range(config.decoder_layers): + # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.decoder.layers.{idx}.self_attn.query.weight"] = in_proj_weight[:hidden_size, :] + state_dict[f"model.decoder.layers.{idx}.self_attn.query.bias"] = in_proj_bias[:hidden_size] + + state_dict[f"model.decoder.layers.{idx}.self_attn.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"model.decoder.layers.{idx}.self_attn.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] + + 
state_dict[f"model.decoder.layers.{idx}.self_attn.value.weight"] = in_proj_weight[-hidden_size:, :] + state_dict[f"model.decoder.layers.{idx}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] + + # We will verify our results on an image of cute cats def prepare_img(): url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -345,18 +364,24 @@ def convert_grounding_dino_checkpoint(args): original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} + for name, param in original_state_dict.items(): + print(name, param.shape) + # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) for src, dest in rename_keys: rename_key(new_state_dict, src, dest) - read_in_q_k_v(new_state_dict, config) + read_in_q_k_v_encoder(new_state_dict, config) + read_in_q_k_v_decoder(new_state_dict, config) # Load HF model model = GroundingDinoForObjectDetection(config) model.eval() missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) # Load and process test image image = prepare_img() @@ -374,7 +399,15 @@ def convert_grounding_dino_checkpoint(args): # Running forward with torch.no_grad(): - _ = model(**inputs) + outputs = model(**inputs) + + print(outputs.logits[0, :3, :3]) + + expected_slice = torch.tensor( + [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] + ) + + assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) if pytorch_dump_folder_path is not None: model.save_pretrained(pytorch_dump_folder_path) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 0e2e6a4baf23cd..56a7f9c55114dc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1273,18 +1273,104 @@ def forward( ) +class GroundingDinoMultiheadAttention(nn.Module): + """Equivalent implementation of nn.MultiheadAttention with batch_first=True.""" + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) + + self.dropout = nn.Dropout(config.attention_dropout) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + queries: torch.Tensor, + keys: torch.Tensor, + values: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] 
= None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(queries) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + else: + key_layer = self.transpose_for_scores(self.key(keys)) + value_layer = self.transpose_for_scores(self.value(values)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDinoModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + context_layer = self.out_proj(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + class GroundingDinoDecoderLayer(nn.Module): def __init__(self, config: GroundingDinoConfig): super().__init__() self.embed_dim = config.d_model # self-attention - self.self_attn = nn.MultiheadAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout, - batch_first=True, - ) + mha_config = copy.deepcopy(config) + mha_config.num_attention_heads = config.decoder_attention_heads + self.self_attn = GroundingDinoMultiheadAttention(mha_config) + # self.self_attn = nn.MultiheadAttention( + # embed_dim=self.embed_dim, + # num_heads=config.decoder_attention_heads, + # dropout=config.attention_dropout, + # batch_first=True, + # ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout @@ -1330,10 +1416,18 @@ def forward( residual = hidden_states # Self Attention - q = k = self.with_pos_embed(hidden_states, position_embeddings) + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=self_attn_mask, + output_attentions=True, ) + # q = k = self.with_pos_embed(hidden_states, position_embeddings) + # hidden_states, self_attn_weights = self.self_attn( + # query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, 
average_attn_weights=False + # ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states From 5a6f2583f9eee532b51c475bfd160eb001fea3cc Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 1 Feb 2024 09:09:38 +0100 Subject: [PATCH 159/252] Replace nn.multiheadattention --- .../convert_grounding_dino_to_hf.py | 40 +++++++++++--- .../grounding_dino/modeling_grounding_dino.py | 52 +++++-------------- 2 files changed, 44 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 5b7290bdfd3184..3d9b7673fbef38 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -331,6 +331,24 @@ def read_in_q_k_v_decoder(state_dict, config): state_dict[f"model.decoder.layers.{idx}.self_attn.value.weight"] = in_proj_weight[-hidden_size:, :] state_dict[f"model.decoder.layers.{idx}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] + # read in weights + bias of cross-attention + in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_weight") + in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_bias") + + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.weight"] = in_proj_weight[:hidden_size, :] + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.bias"] = in_proj_bias[:hidden_size] + + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.bias"] = in_proj_bias[ + hidden_size : hidden_size * 2 + ] + + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.weight"] = in_proj_weight[-hidden_size:, :] + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.bias"] = in_proj_bias[-hidden_size:] + # We will verify our results on an image of cute cats def prepare_img(): @@ -351,6 +369,7 @@ def convert_grounding_dino_checkpoint(args): model_name = args.model_name pytorch_dump_folder_path = args.pytorch_dump_folder_path push_to_hub = args.push_to_hub + verify_logits = args.verify_logits checkpoint_mapping = { "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth", @@ -397,17 +416,19 @@ def convert_grounding_dino_checkpoint(args): assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) - # Running forward - with torch.no_grad(): - outputs = model(**inputs) + if verify_logits: + # Running forward + with torch.no_grad(): + outputs = model(**inputs) - print(outputs.logits[0, :3, :3]) + print(outputs.logits[0, :3, :3]) - expected_slice = torch.tensor( - [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] - ) + expected_slice = torch.tensor( + [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] + ) - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) + assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) + print("Looks ok!") if pytorch_dump_folder_path is not None: model.save_pretrained(pytorch_dump_folder_path) @@ -434,6 +455,9 @@ def convert_grounding_dino_checkpoint(args): parser.add_argument( 
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." ) + parser.add_argument( + "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." + ) args = parser.parse_args() convert_grounding_dino_checkpoint(args) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 56a7f9c55114dc..f99ec9ab2c9717 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1274,7 +1274,7 @@ def forward( class GroundingDinoMultiheadAttention(nn.Module): - """Equivalent implementation of nn.MultiheadAttention with batch_first=True.""" + """Equivalent implementation of nn.MultiheadAttention with `batch_first=True`.""" def __init__(self, config): super().__init__() @@ -1307,26 +1307,11 @@ def forward( keys: torch.Tensor, values: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - mixed_query_layer = self.query(queries) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - else: - key_layer = self.transpose_for_scores(self.key(keys)) - value_layer = self.transpose_for_scores(self.value(values)) - - query_layer = self.transpose_for_scores(mixed_query_layer) + query_layer = self.transpose_for_scores(self.query(queries)) + key_layer = self.transpose_for_scores(self.key(keys)) + value_layer = self.transpose_for_scores(self.value(values)) # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) @@ -1365,24 +1350,14 @@ def __init__(self, config: GroundingDinoConfig): mha_config = copy.deepcopy(config) mha_config.num_attention_heads = config.decoder_attention_heads self.self_attn = GroundingDinoMultiheadAttention(mha_config) - # self.self_attn = nn.MultiheadAttention( - # embed_dim=self.embed_dim, - # num_heads=config.decoder_attention_heads, - # dropout=config.attention_dropout, - # batch_first=True, - # ) + self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention text - self.encoder_attn_text = nn.MultiheadAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout, - batch_first=True, - ) + self.encoder_attn_text = GroundingDinoMultiheadAttention(mha_config) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention self.encoder_attn = GroundingDinoMultiscaleDeformableAttention( @@ -1424,10 +1399,6 @@ def forward( attention_mask=self_attn_mask, output_attentions=True, ) - # q = k = self.with_pos_embed(hidden_states, position_embeddings) - # hidden_states, self_attn_weights = self.self_attn( - # query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False - # ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states @@ -1436,12 +1407,13 @@ def forward( second_residual = hidden_states # Cross-Attention Text + queries = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, text_cross_attn_weights = self.encoder_attn_text( - query=self.with_pos_embed(hidden_states, position_embeddings), - key=text_encoder_hidden_states, - value=text_encoder_hidden_states, - key_padding_mask=text_encoder_attention_mask, - average_attn_weights=False, + queries=queries, + keys=text_encoder_hidden_states, + values=text_encoder_hidden_states, + attention_mask=text_encoder_attention_mask, + output_attentions=True, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) From 9fa83da7f00c2769d9111a8c9f3062810e4a43d7 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 01:17:41 +0100 Subject: [PATCH 160/252] Fixed inputs for GroundingDinoMultiheadAttention & order of modules --- .../grounding_dino/modeling_grounding_dino.py | 3096 ++++++++--------- 1 file changed, 1544 insertions(+), 1552 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 56a7f9c55114dc..7edaa5dbbfb827 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -761,16 +761,97 @@ def forward( return output, attention_weights +class GroundingDinoMultiheadAttention(nn.Module): + """Equivalent implementation of nn.MultiheadAttention with batch_first=True.""" + + def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0): + super().__init__() + if embed_dim % num_heads != 0: + raise ValueError( + f"The hidden size ({embed_dim}) is not a multiple of the number of attention " f"heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(embed_dim / num_heads) + self.all_head_size = self.num_attention_heads * 
self.attention_head_size + + self.query = nn.Linear(embed_dim, self.all_head_size) + self.key = nn.Linear(embed_dim, self.all_head_size) + self.value = nn.Linear(embed_dim, self.all_head_size) + + self.out_proj = nn.Linear(embed_dim, embed_dim) + + self.dropout = nn.Dropout(dropout) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + queries: torch.Tensor, + keys: torch.Tensor, + values: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(queries) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + else: + key_layer = self.transpose_for_scores(self.key(keys)) + value_layer = self.transpose_for_scores(self.value(values)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDinoModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + context_layer = self.out_proj(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + class GroundingDinoTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" def __init__(self, config): super().__init__() - self.self_attn = nn.MultiheadAttention( + self.self_attn = GroundingDinoMultiheadAttention( embed_dim=config.d_model, num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout, - batch_first=True, ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) @@ -1085,7 +1166,6 @@ def forward( return (vision_features, vision_attn), (text_features, text_attn) -# NOTE just renamed the class class GroundingDinoDeformableLayer(nn.Module): def __init__(self, config: GroundingDinoConfig): super().__init__() @@ -1273,115 +1353,27 @@ def forward( ) -class GroundingDinoMultiheadAttention(nn.Module): - """Equivalent implementation of nn.MultiheadAttention with batch_first=True.""" - - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})" - ) - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) - - self.dropout = nn.Dropout(config.attention_dropout) - - def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - queries: torch.Tensor, - keys: torch.Tensor, - values: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - mixed_query_layer = self.query(queries) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
- is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - else: - key_layer = self.transpose_for_scores(self.key(keys)) - value_layer = self.transpose_for_scores(self.value(values)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in GroundingDinoModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) - - context_layer = self.out_proj(context_layer) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - return outputs - - class GroundingDinoDecoderLayer(nn.Module): def __init__(self, config: GroundingDinoConfig): super().__init__() self.embed_dim = config.d_model # self-attention - mha_config = copy.deepcopy(config) - mha_config.num_attention_heads = config.decoder_attention_heads - self.self_attn = GroundingDinoMultiheadAttention(mha_config) - # self.self_attn = nn.MultiheadAttention( - # embed_dim=self.embed_dim, - # num_heads=config.decoder_attention_heads, - # dropout=config.attention_dropout, - # batch_first=True, - # ) + self.self_attn = GroundingDinoMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention text - self.encoder_attn_text = nn.MultiheadAttention( + self.encoder_attn_text = GroundingDinoMultiheadAttention( embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True, ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -2077,1732 +2069,1732 @@ def custom_forward(*inputs): ) -SPECIAL_TOKENS = [101, 102, 1012, 1029] +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDinoText +class GroundingDinoTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" -def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: - """Generate attention mask between each pair of special tokens and positional ids. - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
- Returns: - `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids: - - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`) - - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`) - """ - batch_size, num_token = input_ids.shape - # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens - special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool() - for special_token in SPECIAL_TOKENS: - special_tokens_mask |= input_ids == special_token + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - # idxs: each row is a list of indices of special tokens - idxs = torch.nonzero(special_tokens_mask) + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) - # generate attention mask and positional ids - attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1) - position_ids = torch.zeros((batch_size, num_token), device=input_ids.device) - previous_col = 0 - for i in range(idxs.shape[0]): - row, col = idxs[i] - if (col == 0) or (col == num_token - 1): - attention_mask[row, col, col] = True - position_ids[row, col] = 0 + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() else: - attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True - position_ids[row, previous_col + 1 : col + 1] = torch.arange( - 0, col - previous_col, device=input_ids.device - ) - - previous_col = col + input_shape = inputs_embeds.size()[:-1] - return attention_mask, position_ids.to(torch.long) + seq_length = input_shape[1] + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] -@add_start_docstrings( - """ - The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw - hidden-states without any specific head on top. 
- """, - GROUNDING_DINO_START_DOCSTRING, -) -class GroundingDinoModel(GroundingDinoPreTrainedModel): - def __init__(self, config: GroundingDinoConfig): - super().__init__(config) + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) - # Create backbone + positional encoding - backbone = GroundingDinoConvEncoder(config) - position_embeddings = build_position_encoding(config) - self.backbone = GroundingDinoConvModel(backbone, position_embeddings) + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) - # Create input projection layers - if config.num_feature_levels > 1: - num_backbone_outs = len(backbone.intermediate_channel_sizes) - input_proj_list = [] - for _ in range(num_backbone_outs): - in_channels = backbone.intermediate_channel_sizes[_] - input_proj_list.append( - nn.Sequential( - nn.Conv2d(in_channels, config.d_model, kernel_size=1), - nn.GroupNorm(32, config.d_model), - ) - ) - for _ in range(config.num_feature_levels - num_backbone_outs): - input_proj_list.append( - nn.Sequential( - nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), - nn.GroupNorm(32, config.d_model), - ) - ) - in_channels = config.d_model - self.input_proj_vision = nn.ModuleList(input_proj_list) - else: - self.input_proj_vision = nn.ModuleList( - [ - nn.Sequential( - nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), - nn.GroupNorm(32, config.d_model), - ) - ] + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDinoText +class GroundingDinoTextSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" ) - # Create text backbone - self.text_backbone = GroundingDinoTextPrenet(config.text_backbone_config) - self.text_projection = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size - if config.embedding_init_target or not config.two_stage: - self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, 
self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) - self.encoder = GroundingDinoEncoder(config) - self.decoder = GroundingDinoDecoder(config) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + self.is_decoder = config.is_decoder - if config.two_stage: - self.enc_output = nn.Linear(config.d_model, config.d_model) - self.enc_output_norm = nn.LayerNorm(config.d_model) - if ( - config.two_stage_bbox_embed_share - and config.decoder_bbox_embed_share - and self.decoder.bbox_embed is not None - ): - self.encoder_output_bbox_embed = self.decoder.bbox_embed - else: - self.encoder_output_bbox_embed = GroundingDinoMLPPredictionHead( - input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 - ) + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) - self.encoder_output_class_embed = GroundingDinoContrastiveEmbedding(config) - else: - self.reference_points = nn.Embedding(config.num_queries, 4) + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) - self.post_init() + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None - def get_encoder(self): - return self.encoder + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) - def get_decoder(self): - return self.decoder + query_layer = self.transpose_for_scores(mixed_query_layer) - def freeze_backbone(self): - for name, param in self.backbone.conv_encoder.model.named_parameters(): - param.requires_grad_(False) + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) - def unfreeze_backbone(self): - for name, param in self.backbone.conv_encoder.model.named_parameters(): - param.requires_grad_(True) + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - def get_valid_ratio(self, mask): - """Get the valid ratio of all feature maps.""" + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r - _, height, width = mask.shape - valid_height = torch.sum(mask[:, :, 0], 1) - valid_width = torch.sum(mask[:, 0, :], 1) - valid_ratio_heigth = valid_height.float() / height - valid_ratio_width = valid_width.float() / width - valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1) - return valid_ratio + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - def get_proposal_pos_embed(self, proposals): - """Get the position embedding of the proposals.""" + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - num_pos_feats = self.config.d_model // 2 - temperature = 10000 - scale = 2 * math.pi + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDinoTextModel forward() function) + attention_scores = attention_scores + attention_mask - dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) - dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) - # batch_size, num_queries, 4 - proposals = proposals.sigmoid() * scale - # batch_size, num_queries, 4, 128 - pos = proposals[:, :, :, None] / dim_t - # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 - pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) - return pos + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) - def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): - """Generate the encoder output proposals from encoded enc_output. + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) - Args: - enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder. - padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`. - spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps. 
+ # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask - Returns: - `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. - - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to - directly predict a bounding box. (without the need of a decoder) - - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse - sigmoid. - """ - batch_size = enc_output.shape[0] - proposals = [] - current_position = 0 - for level, (height, width) in enumerate(spatial_shapes): - mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view( - batch_size, height, width, 1 - ) - valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) - valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + context_layer = torch.matmul(attention_probs, value_layer) - grid_y, grid_x = meshgrid( - torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), - torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), - indexing="ij", - ) - grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) - scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) - grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale - width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) - proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) - proposals.append(proposal) - current_position += height * width + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - output_proposals = torch.cat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) - output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid - output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) - output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs - # assign each pixel as an object query - object_query = enc_output - object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) - object_query = object_query.masked_fill(~output_proposals_valid, float(0)) - object_query = self.enc_output_norm(self.enc_output(object_query)) - return object_query, output_proposals - @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GroundingDinoModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - pixel_values: Tensor, - input_ids: Tensor, - token_type_ids: Tensor = None, - attention_mask: Tensor = None, - pixel_mask: Optional[Tensor] = None, - encoder_outputs=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Returns: +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDinoText +class GroundingDinoTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = 
nn.Dropout(config.hidden_dropout_prob) - Examples: + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states - ```python - >>> from transformers import AutoProcessor, GroundingDinoModel - >>> from PIL import Image - >>> import requests - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> text = "a cat." +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDinoText +class GroundingDinoTextAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = GroundingDinoTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = GroundingDinoTextSelfOutput(config) + self.pruned_heads = set() - >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") - >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) - >>> inputs = processor(images=image, text=text, return_tensors="pt") - >>> outputs = model(**inputs) + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - >>> last_hidden_states = outputs.last_hidden_state - >>> list(last_hidden_states.shape) - [1, 900, 256] - ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs - text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDinoText +class 
GroundingDinoTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states - text_token_mask = attention_mask.bool() # just to avoid renaming everywhere - max_text_len = self.config.max_text_len - if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] - position_ids = position_ids[:, :max_text_len] - input_ids = input_ids[:, :max_text_len] - token_type_ids = token_type_ids[:, :max_text_len] - text_token_mask = text_token_mask[:, :max_text_len] +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDinoText +class GroundingDinoTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) - # Extract text features from text backbone - text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ - "last_hidden_state" - ] - text_features = self.text_projection(text_features) + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states - batch_size, num_channels, height, width = pixel_values.shape - device = pixel_values.device - if pixel_mask is None: - pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDinoText +class GroundingDinoTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = GroundingDinoTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = GroundingDinoTextAttention(config, position_embedding_type="absolute") + self.intermediate = GroundingDinoTextIntermediate(config) + self.output = GroundingDinoTextOutput(config) - # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) - # First, sent pixel_values + pixel_mask through Backbone to obtain the features - # which is a list of tuples - vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] 
= None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] - # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) - sources = [] - masks = [] - for level, (source, mask) in enumerate(vision_features): - sources.append(self.input_proj_vision[level](source)) - masks.append(mask) - if mask is None: - raise ValueError("No attention mask was provided") + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage - if self.config.num_feature_levels > len(sources): - _len_sources = len(sources) - for level in range(_len_sources, self.config.num_feature_levels): - if level == _len_sources: - source = self.input_proj_vision[level](vision_features[-1][0]) - else: - source = self.input_proj_vision[level](sources[-1]) - mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] - pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) - sources.append(source) - masks.append(mask) - position_embeddings_list.append(pos_l) + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) - # Create queries - query_embeds = None - if self.config.embedding_init_target or self.config.two_stage: - query_embeds = self.query_position_embeddings.weight + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights - # Prepare encoder inputs (by flattening) - source_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): - batch_size, num_channels, height, width = source.shape - spatial_shape = (height, width) - spatial_shapes.append(spatial_shape) - source = source.flatten(2).transpose(1, 2) - mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose(1, 2) - lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) - lvl_pos_embed_flatten.append(lvl_pos_embed) - source_flatten.append(source) - mask_flatten.append(mask) - source_flatten = torch.cat(source_flatten, 1) - mask_flatten = torch.cat(mask_flatten, 1) - lvl_pos_embed_flatten = 
torch.cat(lvl_pos_embed_flatten, 1) - spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) - valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) - valid_ratios = valid_ratios.float() + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value - # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder - # Also provide spatial_shapes, level_start_index and valid_ratios - if encoder_outputs is None: - encoder_outputs = self.encoder( - vision_features=source_flatten, - vision_attention_mask=~mask_flatten, - vision_position_embedding=lvl_pos_embed_flatten, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - text_features=text_features, - text_attention_mask=~text_token_mask, - text_position_embedding=None, - text_self_attention_masks=text_self_attention_masks, - text_position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDinoEncoderOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, GroundingDinoEncoderOutput): - encoder_outputs = GroundingDinoEncoderOutput( - last_hidden_state_vision=encoder_outputs[0], - last_hidden_state_text=encoder_outputs[1], - hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, - attentions=encoder_outputs[4] if len(encoder_outputs) > 4 else None, - ) + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs - # Fifth, prepare decoder inputs - enc_outputs_class = None - enc_outputs_coord_logits = None - if self.config.two_stage: - object_query_embedding, output_proposals = self.gen_encoder_output_proposals( - encoder_outputs[0], ~mask_flatten, spatial_shapes - ) + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) - # hack implementation for two-stage Deformable DETR - # apply a detection head to each pixel (A.4 in paper) - # linear projection for bounding box binary classification (i.e. 
foreground and background) - enc_outputs_class = self.encoder_output_class_embed( - object_query_embedding, encoder_outputs[1], text_token_mask - ) - # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) - delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) - enc_outputs_coord_logits = delta_bbox + output_proposals + return outputs - # only keep top scoring `config.num_queries` proposals - topk = self.config.num_queries - topk_logits = enc_outputs_class.max(-1)[0] - topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] - topk_coords_logits = torch.gather( - enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) - ) + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output - topk_coords_logits = topk_coords_logits.detach() - reference_points = topk_coords_logits.sigmoid() - init_reference_points = reference_points - if query_embeds is not None: - target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDinoText +class GroundingDinoTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([GroundingDinoTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) else: - target = torch.gather( - object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) - ).detach() - else: - target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) - reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() - init_reference_points = reference_points + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) - decoder_outputs = self.decoder( - inputs_embeds=target, - vision_encoder_hidden_states=encoder_outputs[0], - vision_encoder_attention_mask=mask_flatten, - text_encoder_hidden_states=encoder_outputs[1], - text_encoder_attention_mask=~text_token_mask, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - self_attn_mask=None, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) - tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) - return tuple_outputs - return GroundingDinoModelOutput( - init_reference_points=init_reference_points, - last_hidden_state=decoder_outputs.last_hidden_state, - intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, - intermediate_reference_points=decoder_outputs.intermediate_reference_points, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, - encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, - encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, - encoder_hidden_states_text=encoder_outputs.hidden_states_text, - encoder_attentions=encoder_outputs.attentions, - enc_outputs_class=enc_outputs_class, - enc_outputs_coord_logits=enc_outputs_coord_logits, - ) +# Copied from 
transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDinoText +class GroundingDinoTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output -@add_start_docstrings( - """ - Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, - for tasks such as COCO detection. - """, - GROUNDING_DINO_START_DOCSTRING, -) -class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel): - # When using clones, all layers > 0 will be clones, but layer 0 *is* required - _tied_weights_keys = [r"bbox_embed\.[1-9]\d*"] - def __init__(self, config: GroundingDinoConfig): - super().__init__(config) +class GroundingDinoTextPrenet(GroundingDinoPreTrainedModel): + config_class = GroundingDinoTextConfig - # Deformable DETR encoder-decoder model - self.model = GroundingDinoModel(config) + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config - # Detection heads on top - _class_embed = GroundingDinoContrastiveEmbedding(config) - _bbox_embed = GroundingDinoMLPPredictionHead( - input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 - ) + self.embeddings = GroundingDinoTextEmbeddings(config) + self.encoder = GroundingDinoTextEncoder(config) - if config.decoder_bbox_embed_share: - self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) - else: - self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) - self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) - # hack implementation for two-stage - self.model.decoder.bbox_embed = self.bbox_embed - self.model.decoder.class_embed = self.class_embed + self.pooler = GroundingDinoTextPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() - # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py - @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_coord): - # this is a workaround to make torchscript happy, as torchscript - # doesn't support dictionary with non-homogeneous values, such - # as a dict having both a Tensor and a list. - return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + def get_input_embeddings(self): + return self.embeddings.word_embeddings - @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GroundingDinoObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - pixel_values: torch.FloatTensor, - input_ids: torch.LongTensor, - attention_mask: torch.LongTensor = None, - token_type_ids: torch.LongTensor = None, - pixel_mask: Optional[torch.BoolTensor] = None, - encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, Tuple]] = None, - labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ): - r""" - labels (`List[Dict]` of len `(batch_size,)`, *optional*): - Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the - following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch - respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes - in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. - - Returns: - - Examples: + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - ```python - >>> from transformers import AutoProcessor, GroundingDinoForObjectDetection - >>> from PIL import Image - >>> import requests + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> text = "a cat." + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device - >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") - >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length)), device=device) - >>> inputs = processor(images=image, text=text, return_tensors="pt") - >>> outputs = model(**inputs) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - >>> # convert outputs (bounding boxes and class logits) to COCO API - >>> target_sizes = torch.tensor([image.size[::-1]]) - >>> results = processor.image_processor.post_process_object_detection( - ... outputs, threshold=0.35, target_sizes=target_sizes - ... 
)[0] - >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): - ... box = [round(i, 2) for i in box.tolist()] - ... print(f"Detected {label.item()} with confidence " f"{round(score.item(), 3)} at location {box}") - Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83] - Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89] - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs - outputs = self.model( - pixel_values=pixel_values, + embedding_output = self.embeddings( input_ids=input_ids, - attention_mask=attention_mask, + position_ids=position_ids, token_type_ids=token_type_ids, - pixel_mask=pixel_mask, - encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - # index for encoder_last_hidden_state_text - idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0) - - hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] - enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx] - init_reference = outputs.init_reference_points if return_dict else outputs[0] - inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] - - # class logits + predicted bounding boxes - outputs_classes = [] - outputs_coords = [] + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] - for level in range(hidden_states.shape[1]): - if level == 0: - reference = init_reference - else: - reference = inter_references[:, level - 1] - reference = inverse_sigmoid(reference) - outputs_class = self.class_embed[level]( - vision_hidden_state=hidden_states[:, level], - text_hidden_state=enc_text_hidden_state, - text_token_mask=attention_mask.bool(), - ) - delta_bbox = self.bbox_embed[level](hidden_states[:, level]) - if reference.shape[-1] == 4: - outputs_coord_logits = delta_bbox + reference - elif reference.shape[-1] == 2: - delta_bbox[..., :2] += reference - outputs_coord_logits = delta_bbox - else: - raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") - outputs_coord = outputs_coord_logits.sigmoid() - outputs_classes.append(outputs_class) - outputs_coords.append(outputs_coord) - outputs_class = torch.stack(outputs_classes) - outputs_coord = 
torch.stack(outputs_coords) + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) - logits = outputs_class[-1] - pred_boxes = outputs_coord[-1] +SPECIAL_TOKENS = [101, 102, 1012, 1029] - loss, loss_dict, auxiliary_outputs = None, None, None - if labels is not None: - # First: create the matcher - matcher = GroundingDinoHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality"] - criterion = GroundingDinoLoss( - matcher=matcher, - num_classes=self.config.num_labels, - focal_alpha=self.config.focal_alpha, - losses=losses, - ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes - if self.config.auxiliary_loss: - auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - if self.config.two_stage: - enc_outputs_coord = outputs[-1].sigmoid() - outputs_loss["enc_outputs"] = {"logits": outputs[-2], "pred_boxes": enc_outputs_coord} - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) +def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: + """Generate attention mask between each pair of special tokens and positional ids. + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + Returns: + `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids: + - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`) + - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`) + """ + batch_size, num_token = input_ids.shape + # special_tokens_mask: batch_size, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool() + for special_token in SPECIAL_TOKENS: + special_tokens_mask |= input_ids == special_token - if not return_dict: - if auxiliary_outputs is not None: - output = (logits, pred_boxes) + auxiliary_outputs + outputs - else: - output = (logits, pred_boxes) + outputs - tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) - return tuple_outputs + # generate attention mask and positional ids + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1) + position_ids = torch.zeros((batch_size, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) - dict_outputs = GroundingDinoObjectDetectionOutput( - loss=loss, - loss_dict=loss_dict, - logits=logits, - pred_boxes=pred_boxes, - auxiliary_outputs=auxiliary_outputs, - last_hidden_state=outputs.last_hidden_state, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, - encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, - encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, - encoder_hidden_states_text=outputs.encoder_hidden_states_text, - encoder_attentions=outputs.encoder_attentions, - intermediate_hidden_states=outputs.intermediate_hidden_states, - intermediate_reference_points=outputs.intermediate_reference_points, - init_reference_points=outputs.init_reference_points, - enc_outputs_class=outputs.enc_outputs_class, - enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, - ) + previous_col = col - return dict_outputs + return attention_mask, position_ids.to(torch.long) -# Copied from transformers.models.detr.modeling_detr.dice_loss -def dice_loss(inputs, targets, num_boxes): +@add_start_docstrings( """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes + The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. 
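`generate_masks_with_special_tokens_and_transfer_map` builds a block-diagonal self-attention mask so that text tokens only attend within their own phrase (phrases are delimited by `[CLS]`, `[SEP]` and sentence punctuation), and restarts the position ids at every phrase boundary. A standalone sketch of the same idea; the id-to-token reading in the comment is an assumption, and 2001/2002 stand in for ordinary word ids:

```python
import torch

SPECIAL_TOKENS = [101, 102, 1012, 1029]  # assumed: [CLS], [SEP], ".", "?" in a BERT-style vocab

def phrase_level_masks(input_ids: torch.LongTensor):
    """Sketch: block-diagonal self-attention mask + per-phrase position ids."""
    batch_size, num_token = input_ids.shape
    special = torch.zeros_like(input_ids, dtype=torch.bool)
    for tok in SPECIAL_TOKENS:
        special |= input_ids == tok

    attention_mask = torch.eye(num_token, dtype=torch.bool).unsqueeze(0).repeat(batch_size, 1, 1)
    position_ids = torch.zeros((batch_size, num_token), dtype=torch.long)

    previous_col = 0
    for row, col in torch.nonzero(special):
        if col == 0 or col == num_token - 1:
            attention_mask[row, col, col] = True
            position_ids[row, col] = 0
        else:
            # the phrase spans everything after the previous delimiter, up to and including this one
            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
            position_ids[row, previous_col + 1 : col + 1] = torch.arange(0, col - previous_col)
        previous_col = col
    return attention_mask, position_ids

# "[CLS] <w> <w> . <w> <w> . [SEP]"
ids = torch.tensor([[101, 2001, 2002, 1012, 2001, 2002, 1012, 102]])
mask, pos = phrase_level_masks(ids)
print(pos)            # tensor([[0, 0, 1, 2, 0, 1, 2, 0]])
print(mask[0].int())  # two 3x3 blocks on the diagonal, one per phrase
```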
+ """, + GROUNDING_DINO_START_DOCSTRING, +) +class GroundingDinoModel(GroundingDinoPreTrainedModel): + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) + # Create backbone + positional encoding + backbone = GroundingDinoConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = GroundingDinoConvModel(backbone, position_embeddings) -# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): - """ - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj_vision = nn.ModuleList(input_proj_list) + else: + self.input_proj_vision = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) - Args: - inputs (`torch.FloatTensor` of arbitrary shape): - The predictions for each example. - targets (`torch.FloatTensor` with the same shape as `inputs`) - A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class - and 1 for the positive class). - alpha (`float`, *optional*, defaults to `0.25`): - Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): - Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. 
+ # Create text backbone + self.text_backbone = GroundingDinoTextPrenet(config.text_backbone_config) + self.text_projection = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) - Returns: - Loss tensor - """ - prob = inputs.sigmoid() - ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - # add modulating factor - p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) + if config.embedding_init_target or not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss + self.encoder = GroundingDinoEncoder(config) + self.decoder = GroundingDinoDecoder(config) - return loss.mean(1).sum() / num_boxes + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + if ( + config.two_stage_bbox_embed_share + and config.decoder_bbox_embed_share + and self.decoder.bbox_embed is not None + ): + self.encoder_output_bbox_embed = self.decoder.bbox_embed + else: + self.encoder_output_bbox_embed = GroundingDinoMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDino -class GroundingDinoLoss(nn.Module): - """ - This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we - compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of - matched ground-truth / prediction (supervise class and box). + self.encoder_output_class_embed = GroundingDinoContrastiveEmbedding(config) + else: + self.reference_points = nn.Embedding(config.num_queries, 4) - Args: - matcher (`GroundingDinoHungarianMatcher`): - Module able to compute a matching between targets and proposals. - num_classes (`int`): - Number of object categories, omitting the special no-object category. - focal_alpha (`float`): - Alpha parameter in focal loss. - losses (`List[str]`): - List of all the losses to be applied. See `get_loss` for a list of all available losses. 
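The focal loss described in the docstring above down-weights well-classified examples by the factor `(1 - p_t)**gamma` and rebalances positives against negatives with `alpha`. A compact worked sketch mirroring that formula, with the per-box normalisation left to the caller:

```python
import torch
import torch.nn.functional as F

def sigmoid_focal_loss(logits, targets, num_boxes, alpha=0.25, gamma=2.0):
    """Focal loss sketch: BCE scaled by (1 - p_t)**gamma, balanced by alpha."""
    prob = logits.sigmoid()
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)   # probability of the true class
    loss = ce * (1 - p_t) ** gamma
    if alpha >= 0:
        loss = (alpha * targets + (1 - alpha) * (1 - targets)) * loss
    return loss.mean(1).sum() / num_boxes

# an easy positive (logit 4.0) contributes far less than a hard one (logit -1.0)
logits = torch.tensor([[[4.0], [-1.0]]])   # (batch=1, num_queries=2, num_classes=1)
targets = torch.tensor([[[1.0], [1.0]]])
print(sigmoid_focal_loss(logits, targets, num_boxes=1))
```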
- """ + self.post_init() - def __init__(self, matcher, num_classes, focal_alpha, losses): - super().__init__() - self.matcher = matcher - self.num_classes = num_classes - self.focal_alpha = focal_alpha - self.losses = losses + def get_encoder(self): + return self.encoder - # removed logging parameter, which was part of the original implementation - def loss_labels(self, outputs, targets, indices, num_boxes): - """ - Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor - of dim [nb_target_boxes] - """ - if "logits" not in outputs: - raise KeyError("No logits were found in the outputs") - source_logits = outputs["logits"] + def get_decoder(self): + return self.decoder - idx = self._get_source_permutation_idx(indices) - target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - ) - target_classes[idx] = target_classes_o + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) - target_classes_onehot = torch.zeros( - [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], - dtype=source_logits.dtype, - layout=source_logits.layout, - device=source_logits.device, - ) - target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) - target_classes_onehot = target_classes_onehot[:, :, :-1] - loss_ce = ( - sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) - * source_logits.shape[1] - ) - losses = {"loss_ce": loss_ce} + def get_valid_ratio(self, mask): + """Get the valid ratio of all feature maps.""" - return losses + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_heigth = valid_height.float() / height + valid_ratio_width = valid_width.float() / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1) + return valid_ratio - @torch.no_grad() - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. - """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes - def loss_boxes(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. 
+ dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos - Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes - are expected in format (center_x, center_y, w, h), normalized by the image size. - """ - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - idx = self._get_source_permutation_idx(indices) - source_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. - loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") + Args: + enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder. + padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`. + spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps. - losses = {} - losses["loss_bbox"] = loss_bbox.sum() / num_boxes + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. 
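`get_proposal_pos_embed` turns each (cx, cy, w, h) proposal into a sinusoidal embedding, interleaving sines and cosines per coordinate exactly as in the standard transformer position encoding. A standalone sketch of that transform (the 256-dimensional `d_model` is illustrative):

```python
import math
import torch

def proposal_pos_embed(proposals: torch.Tensor, d_model: int = 256) -> torch.Tensor:
    """Sine/cosine embedding of (cx, cy, w, h) proposal logits -> (batch, queries, 2 * d_model)."""
    num_pos_feats = d_model // 2
    temperature = 10000
    scale = 2 * math.pi
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
    pos = proposals.sigmoid() * scale     # (batch, queries, 4), mapped into [0, 2*pi]
    pos = pos[:, :, :, None] / dim_t      # (batch, queries, 4, num_pos_feats)
    pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
    return pos

proposals = torch.randn(2, 900, 4)        # box logits before the sigmoid
print(proposal_pos_embed(proposals).shape)   # torch.Size([2, 900, 512])
```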
+ """ + batch_size = enc_output.shape[0] + proposals = [] + current_position = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view( + batch_size, height, width, 1 + ) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) - loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) - ) - losses["loss_giou"] = loss_giou.sum() / num_boxes - return losses + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx - def _get_source_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) - source_idx = torch.cat([source for (source, _) in indices]) - return batch_idx, source_idx - - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx - def _get_target_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) - target_idx = torch.cat([target for (_, target) in indices]) - return batch_idx, target_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes): - loss_map = { - "labels": self.loss_labels, - "cardinality": self.loss_cardinality, - "boxes": self.loss_boxes, - } - if loss not in loss_map: - raise ValueError(f"Loss {loss} not supported") - return loss_map[loss](outputs, targets, indices, num_boxes) - - def forward(self, outputs, targets): - """ - This performs the loss computation. + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + current_position += height * width - Args: - outputs (`dict`, *optional*): - Dictionary of tensors, see the output specification of the model for the format. - targets (`List[dict]`, *optional*): - List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the - losses applied, see each loss' doc. 
- """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals - # Compute the average number of target boxes accross all nodes, for normalization purposes - num_boxes = sum(len(t["class_labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - # (Niels): comment out function below, distributed training to be added - # if is_dist_avail_and_initialized(): - # torch.distributed.all_reduce(num_boxes) - # (Niels) in original implementation, num_boxes is divided by get_world_size() - num_boxes = torch.clamp(num_boxes, min=1).item() + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDinoModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Tensor, + input_ids: Tensor, + token_type_ids: Tensor = None, + attention_mask: Tensor = None, + pixel_mask: Optional[Tensor] = None, + encoder_outputs=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + Examples: - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. - if "auxiliary_outputs" in outputs: - for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): - indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) - l_dict = {k + f"_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) + ```python + >>> from transformers import AutoProcessor, GroundingDinoModel + >>> from PIL import Image + >>> import requests - if "enc_outputs" in outputs: - enc_outputs = outputs["enc_outputs"] - bin_targets = copy.deepcopy(targets) - for bt in bin_targets: - bt["class_labels"] = torch.zeros_like(bt["class_labels"]) - indices = self.matcher(enc_outputs, bin_targets) - for loss in self.losses: - l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) - l_dict = {k + "_enc": v for k, v in l_dict.items()} - losses.update(l_dict) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." 
- return losses + >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model(**inputs) -# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead -class GroundingDinoMLPPredictionHead(nn.Module): - """ - Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, - height and width of a bounding box w.r.t. an image. + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 900, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) - """ + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) - def forward(self, x): - for i, layer in enumerate(self.layers): - x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + max_text_len = self.config.max_text_len + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + input_ids = input_ids[:, :max_text_len] + token_type_ids = token_type_ids[:, :max_text_len] + text_token_mask = text_token_mask[:, :max_text_len] -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino -class GroundingDinoHungarianMatcher(nn.Module): - """ - This class computes an assignment between the targets and the predictions of the network. + # Extract text features from text backbone + text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ + "last_hidden_state" + ] + text_features = self.text_projection(text_features) - For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more - predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are - un-matched (and thus treated as non-objects). + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device - Args: - class_cost: - The relative weight of the classification error in the matching cost. - bbox_cost: - The relative weight of the L1 error of the bounding box coordinates in the matching cost. - giou_cost: - The relative weight of the giou loss of the bounding box in the matching cost. 
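The bounding-box regression branch referenced in this hunk is a plain 3-layer MLP with ReLU between hidden layers and no activation on the output; the 4 outputs are treated as (cx, cy, w, h) logits and squashed with a sigmoid. A usage sketch with illustrative sizes:

```python
import torch
import torch.nn as nn

class MLPPredictionHead(nn.Module):
    """3-layer MLP: d_model -> d_model -> d_model -> 4 box logits (sketch)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = torch.relu(layer(x)) if i < len(self.layers) - 1 else layer(x)
        return x

head = MLPPredictionHead(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
decoder_output = torch.randn(2, 900, 256)   # (batch, num_queries, d_model), sizes made up
boxes = head(decoder_output).sigmoid()      # normalized (cx, cy, w, h) in [0, 1]
print(boxes.shape)                          # torch.Size([2, 900, 4])
```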
- """ + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) - def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): - super().__init__() - requires_backends(self, ["scipy"]) + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: - raise ValueError("All costs of the Matcher can't be 0") + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(vision_features): + sources.append(self.input_proj_vision[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") - @torch.no_grad() - def forward(self, outputs, targets): - """ - Args: - outputs (`dict`): - A dictionary that contains at least these entries: - * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. - targets (`List[dict]`): - A list of targets (len(targets) = batch_size), where each target is a dict containing: - * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of - ground-truth - objects in the target) containing the class labels - * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
+ # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj_vision[level](vision_features[-1][0]) + else: + source = self.input_proj_vision[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) - Returns: - `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - batch_size, num_queries = outputs["logits"].shape[:2] + # Create queries + query_embeds = None + if self.config.embedding_init_target or self.config.two_stage: + query_embeds = self.query_position_embeddings.weight - # We flatten to compute the cost matrices in a batch - out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + valid_ratios = valid_ratios.float() - # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) - target_bbox = torch.cat([v["boxes"] for v in targets]) - - # Compute the classification cost. 
- alpha = 0.25 - gamma = 2.0 - neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) - pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) - class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + vision_features=source_flatten, + vision_attention_mask=~mask_flatten, + vision_position_embedding=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + text_features=text_features, + text_attention_mask=~text_token_mask, + text_position_embedding=None, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDinoEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, GroundingDinoEncoderOutput): + encoder_outputs = GroundingDinoEncoderOutput( + last_hidden_state_vision=encoder_outputs[0], + last_hidden_state_text=encoder_outputs[1], + hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, + attentions=encoder_outputs[4] if len(encoder_outputs) > 4 else None, + ) - # Compute the L1 cost between boxes - bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + # Fifth, prepare decoder inputs + enc_outputs_class = None + enc_outputs_coord_logits = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes + ) - # Compute the giou cost between boxes - giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + # hack implementation for two-stage Deformable DETR + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. 
foreground and background) + enc_outputs_class = self.encoder_output_class_embed( + object_query_embedding, encoder_outputs[1], text_token_mask + ) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals - # Final cost matrix - cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost - cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + # only keep top scoring `config.num_queries` proposals + topk = self.config.num_queries + topk_logits = enc_outputs_class.max(-1)[0] + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + if query_embeds is not None: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + else: + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ).detach() + else: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() + init_reference_points = reference_points + decoder_outputs = self.decoder( + inputs_embeds=target, + vision_encoder_hidden_states=encoder_outputs[0], + vision_encoder_attention_mask=mask_flatten, + text_encoder_hidden_states=encoder_outputs[1], + text_encoder_attention_mask=~text_token_mask, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + self_attn_mask=None, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) -# Copied from transformers.models.detr.modeling_detr._upcast -def _upcast(t: Tensor) -> Tensor: - # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + return tuple_outputs -# Copied from transformers.models.detr.modeling_detr.box_area -def box_area(boxes: Tensor) -> Tensor: - """ - Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. 
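In the two-stage path only the `config.num_queries` highest-scoring encoder proposals are kept, and their detached box logits (after a sigmoid) become the decoder's initial reference points. A shape-level sketch of that top-k selection with made-up sizes:

```python
import torch

num_queries = 3                              # config.num_queries, kept small here
enc_class_logits = torch.randn(1, 10, 20)    # (batch, proposals, text dim) similarity logits
enc_coord_logits = torch.randn(1, 10, 4)     # proposal box logits (before the sigmoid)

# score each proposal by its best-matching text feature, keep the top `num_queries`
topk_scores, topk_idx = torch.topk(enc_class_logits.max(-1).values, num_queries, dim=1)
topk_coords = torch.gather(enc_coord_logits, 1, topk_idx.unsqueeze(-1).repeat(1, 1, 4))

reference_points = topk_coords.detach().sigmoid()   # initial (cx, cy, w, h) for the decoder
print(reference_points.shape)                       # torch.Size([1, 3, 4])
```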
+ return GroundingDinoModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, + encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, + encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, + encoder_hidden_states_text=encoder_outputs.hidden_states_text, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) - Args: - boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): - Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 - < x2` and `0 <= y1 < y2`. - Returns: - `torch.FloatTensor`: a tensor containing the area for each box. +@add_start_docstrings( """ - boxes = _upcast(boxes) - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, + for tasks such as COCO detection. + """, + GROUNDING_DINO_START_DOCSTRING, +) +class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*"] -# Copied from transformers.models.detr.modeling_detr.box_iou -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) - left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + # Deformable DETR encoder-decoder model + self.model = GroundingDinoModel(config) - width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] - inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + # Detection heads on top + _class_embed = GroundingDinoContrastiveEmbedding(config) + _bbox_embed = GroundingDinoMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) - union = area1[:, None] + area2 - inter + if config.decoder_bbox_embed_share: + self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) + else: + self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) + self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) + # hack implementation for two-stage + self.model.decoder.bbox_embed = self.bbox_embed + self.model.decoder.class_embed = self.class_embed - iou = inter / union - return iou, union + # Initialize weights and apply final processing + self.post_init() + # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDinoObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + attention_mask: torch.LongTensor = None, + token_type_ids: torch.LongTensor = None, + pixel_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, Tuple]] = None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. - Returns: - `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): - raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") - if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): - raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") - iou, union = box_iou(boxes1, boxes2) + Returns: - top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + Examples: - width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] - area = width_height[:, :, 0] * width_height[:, :, 1] + ```python + >>> from transformers import AutoProcessor, GroundingDinoForObjectDetection + >>> from PIL import Image + >>> import requests - return iou - (area - union) / area + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." + >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") -# Copied from transformers.models.detr.modeling_detr._max_by_axis -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = processor.image_processor.post_process_object_detection( + ... outputs, threshold=0.35, target_sizes=target_sizes + ... 
)[0] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print(f"Detected {label.item()} with confidence " f"{round(score.item(), 3)} at location {box}") + Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83] + Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict -# Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor(object): - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_mask=pixel_mask, + encoder_outputs=encoder_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) - def decompose(self): - return self.tensors, self.mask + # index for encoder_last_hidden_state_text + idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0) - def __repr__(self): - return str(self.tensors) + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] -# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return NestedTensor(tensor, mask) + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level]( + vision_hidden_state=hidden_states[:, level], + text_hidden_state=enc_text_hidden_state, + text_token_mask=attention_mask.bool(), + ) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise 
ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] -# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDinoText -class GroundingDinoTextEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = GroundingDinoHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = GroundingDinoLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + if self.config.two_stage: + enc_outputs_coord = outputs[-1].sigmoid() + outputs_loss["enc_outputs"] = {"logits": outputs[-2], "pred_boxes": enc_outputs_coord} - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer( - "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False - ) - self.register_buffer( - "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False - ) + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - token_type_ids: 
Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - past_key_values_length: int = 0, - ) -> torch.Tensor: - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] + return tuple_outputs - seq_length = input_shape[1] + dict_outputs = GroundingDinoObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, + encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, + encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, + encoder_hidden_states_text=outputs.encoder_hidden_states_text, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) - if position_ids is None: - position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + return dict_outputs - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves - # issue #5664 - if token_type_ids is None: - if hasattr(self, "token_type_ids"): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) +# Copied from transformers.models.detr.modeling_detr.dice_loss +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). 
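The DICE loss described in this docstring rewards overlap between a predicted (sigmoid) mask and its binary target, much like a soft IoU; a perfect prediction drives it towards zero. A tiny worked sketch (both tensors are flattened here for self-containment, which the caller normally handles):

```python
import torch

def dice_loss(inputs, targets, num_boxes):
    """DICE loss sketch: 1 - (2*|X*Y| + 1) / (|X| + |Y| + 1), averaged over boxes."""
    inputs = inputs.sigmoid().flatten(1)
    targets = targets.flatten(1)
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    return (1 - (numerator + 1) / (denominator + 1)).sum() / num_boxes

# a prediction that confidently covers the whole target gives a loss close to 0
logits = torch.full((1, 4, 4), 10.0)            # sigmoid ~ 1 everywhere
target = torch.ones((1, 4, 4))
print(dice_loss(logits, target, num_boxes=1))   # tensor(~2e-05)
```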
+ """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDinoText -class GroundingDinoTextSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})" - ) +# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss - self.is_decoder = config.is_decoder + return loss.mean(1).sum() / num_boxes - def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - mixed_query_layer = self.query(hidden_states) +# Copied 
from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDino +class GroundingDinoLoss(nn.Module): + """ + This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we + compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of + matched ground-truth / prediction (supervise class and box). - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None + Args: + matcher (`GroundingDinoHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parameter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses - query_layer = self.transpose_for_scores(mixed_query_layer) + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] - use_cache = past_key_value is not None - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + target_classes_onehot = torch.zeros( + [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + dtype=source_logits.dtype, + layout=source_logits.layout, + device=source_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( - -1, 1 - ) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * source_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + return losses - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + @torch.no_grad() + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in GroundingDinoTextModel forward() function) - attention_scores = attention_scores + attention_mask + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. 
+ """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") - context_layer = torch.matmul(attention_probs, value_layer) + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx - if self.is_decoder: - outputs = outputs + (past_key_value,) - return outputs + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDinoText -class 
GroundingDinoTextSelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) + def forward(self, outputs, targets): + """ + This performs the loss computation. - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDinoText -class GroundingDinoTextAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): - super().__init__() - self.self = GroundingDinoTextSelfAttention(config, position_embedding_type=position_embedding_type) - self.output = GroundingDinoTextSelfOutput(config) - self.pruned_heads = set() + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads - ) + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
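To make the auxiliary-loss bookkeeping that follows easier to read, here is a minimal, self-contained sketch (with made-up loss values, not taken from the diff) of how losses from intermediate decoder layers get index-suffixed keys:

```python
# Minimal sketch with made-up values: losses from the i-th intermediate decoder layer
# are stored under index-suffixed keys so they can be logged and weighted separately.
l_dict = {"loss_ce": 0.7, "loss_bbox": 0.3, "loss_giou": 0.5}
i = 1  # hypothetical intermediate decoder layer index
print({k + f"_{i}": v for k, v in l_dict.items()})
# {'loss_ce_1': 0.7, 'loss_bbox_1': 0.3, 'loss_giou_1': 0.5}
```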
+ if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) + if "enc_outputs" in outputs: + enc_outputs = outputs["enc_outputs"] + bin_targets = copy.deepcopy(targets) + for bt in bin_targets: + bt["class_labels"] = torch.zeros_like(bt["class_labels"]) + indices = self.matcher(enc_outputs, bin_targets) + for loss in self.losses: + l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) + l_dict = {k + "_enc": v for k, v in l_dict.items()} + losses.update(l_dict) - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs + return losses -# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDinoText -class GroundingDinoTextIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead +class GroundingDinoMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. 
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + """ -# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDinoText -class GroundingDinoTextOutput(nn.Module): - def __init__(self, config): + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x -# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDinoText -class GroundingDinoTextLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = GroundingDinoTextAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = GroundingDinoTextAttention(config, position_embedding_type="absolute") - self.intermediate = GroundingDinoTextIntermediate(config) - self.output = GroundingDinoTextOutput(config) - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino +class GroundingDinoHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + For efficiency reasons, the targets don't include the no_object. 
Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, "crossattention"): - raise ValueError( - f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" - " by setting `config.add_cross_attention=True`" - ) + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") - layer_output = apply_chunking_to_forward( - self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output - ) - outputs = (layer_output,) + outputs + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
- # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value,) + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] - return outputs + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDinoText -class GroundingDinoTextEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList([GroundingDinoTextLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = False, - output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[i] if past_key_values is not None else None - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer_module.__call__, - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) +# Copied from transformers.models.detr.modeling_detr._upcast +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1],) - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) +# Copied from transformers.models.detr.modeling_detr.box_area +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) -# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDinoText -class GroundingDinoTextPooler(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
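As a worked example of the `linear_sum_assignment` call in the matcher above, the sketch below runs the Hungarian algorithm on a made-up 3-query-by-2-target cost matrix for a single image; the cost values are invented purely for illustration:

```python
import torch
from scipy.optimize import linear_sum_assignment

# Made-up cost matrix: 3 predicted queries x 2 ground-truth boxes for one image.
cost = torch.tensor([[0.9, 0.1],
                     [0.4, 0.8],
                     [0.2, 0.3]])
row_ind, col_ind = linear_sum_assignment(cost.numpy())
print(row_ind, col_ind)  # [0 2] [1 0]: query 0 -> target 1, query 2 -> target 0, query 1 unmatched
```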
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output +# Copied from transformers.models.detr.modeling_detr.box_iou +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] -class GroundingDinoTextPrenet(GroundingDinoPreTrainedModel): - config_class = GroundingDinoTextConfig + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) - self.config = config + union = area1[:, None] + area2 - inter - self.embeddings = GroundingDinoTextEmbeddings(config) - self.encoder = GroundingDinoTextEncoder(config) + iou = inter / union + return iou, union - self.pooler = GroundingDinoTextPooler(config) if add_pooling_layer else None - # Initialize weights and apply final processing - self.post_init() +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. - def get_input_embeddings(self): - return self.embeddings.word_embeddings + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return iou - (area - union) / area - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device +# Copied from transformers.models.detr.modeling_detr._max_by_axis +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length)), device=device) - if token_type_ids is None: - if hasattr(self.embeddings, "token_type_ids"): - buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) +# Copied from transformers.models.detr.modeling_detr.NestedTensor +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + def decompose(self): + return self.tensors, self.mask - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + def __repr__(self): + return str(self.tensors) - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + batch_size, num_channels, height, width = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) From 06ba0ecdf65223c6dc0356dcba24a4adc6ecc654 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 01:18:27 +0100 Subject: [PATCH 161/252] Fixed processing to avoid messing with inputs --- .../models/grounding_dino/processing_grounding_dino.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 0e658a42f77baa..7e6d2a2b29f9d2 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -40,6 +40,9 @@ def get_phrases_from_posmap(posmaps, input_ids): left_idx = 0 right_idx = 255 + # Avoiding altering the input tensor + posmaps = posmaps.clone() + posmaps[:, 0 : left_idx + 1] = False posmaps[:, right_idx:] = False From 9cda12eba36e84add9d82f305c17dccb80aaef3d Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 02:26:47 +0100 Subject: [PATCH 162/252] Added more tips for GroundingDino --- docs/source/en/model_doc/grounding-dino.md 
| 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index f3ccc78ad5c876..2c6bbf735cd0eb 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -27,6 +27,40 @@ The abstract from the paper is the following: Tips: - One can use [`GroundingDinoProcessor`] to prepare image-text pairs for the model. +- To separate classes in the text, use a period, e.g. "a cat. a dog." +- When using multiple classes, use `post_process_grounded_object_detection` from [`GroundingDinoProcessor`] to post-process the outputs + +```python +import requests + +import torch +from PIL import Image +from transformers import AutoModelForObjectDetection, AutoProcessor + +model_id = "EduardoPacheco/grounding-dino-tiny" + +model = AutoModelForObjectDetection.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +def load_image(url): + return Image.open(requests.get(url, stream=True).raw) + +image = load_image('http://images.cocodataset.org/val2017/000000039769.jpg') +# Check for cats and remote controls +text = "a cat. a remote control" + +inputs = processor(images=image, text=text, return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs) + +results = processor.post_process_grounded_object_detection( + outputs, + inputs.input_ids, + bbox_threshold=0.4, + text_threshold=0.3, + target_sizes=[image.size[::-1]] +) +``` drawing @@ -46,6 +80,7 @@ The original code can be found [here](https://github.com/IDEA-Research/Grounding ## GroundingDinoProcessor [[autodoc]] GroundingDinoProcessor + - post_process_grounded_object_detection ## GroundingDinoTextConfig From bde2c6a6d4137e3f11df525c817abc05b74d0ece Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 02:27:02 +0100 Subject: [PATCH 163/252] Make style --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 7edaa5dbbfb827..ab6a2e9df1c1e9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2069,7 +2069,6 @@ def custom_forward(*inputs): ) - # Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDinoText class GroundingDinoTextEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -2672,6 +2671,7 @@ def forward( attentions=encoder_outputs.attentions, ) + SPECIAL_TOKENS = [101, 102, 1012, 1029] From 01c382e7544e658c4ce7bb0ecdd11c552f700d81 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 02:30:22 +0100 Subject: [PATCH 164/252] Changing name to align with SAM --- .../grounding_dino/modeling_grounding_dino.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ab6a2e9df1c1e9..c2cd8aa74ac454 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -206,11 +206,11 @@ class GroundingDinoEncoderOutput(ModelOutput): Sequence of hidden-states at the output of the last layer of the vision encoder.
last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the text encoder. - hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. @@ -223,8 +223,8 @@ class GroundingDinoEncoderOutput(ModelOutput): last_hidden_state_vision: torch.FloatTensor = None last_hidden_state_text: torch.FloatTensor = None - hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None - hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + text_hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None @@ -1626,7 +1626,7 @@ def _set_gradient_checkpointing(self, module, value=False): encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): Tuple consists of (`last_hidden_state_vision`, *optional*: `last_hidden_state_text`, *optional*: - `hidden_states_vision`, *optional*: `hidden_states_text`, *optional*: `attentions`) + `vision_hidden_states`, *optional*: `text_hidden_states`, *optional*: `attentions`) `last_hidden_state_vision` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. 
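Because `GroundingDinoModel` also indexes the encoder output positionally (`encoder_outputs[2]`, `encoder_outputs[3]` in the hunk below), the rename in this patch only changes attribute names, not tuple positions. A toy dataclass stand-in (not the real `GroundingDinoEncoderOutput`) illustrates the field order being relied on:

```python
from dataclasses import dataclass, fields
from typing import Optional, Tuple
import torch

# Toy stand-in for the renamed output class; positional indexing relies on this field order.
@dataclass
class ToyEncoderOutput:
    last_hidden_state_vision: Optional[torch.Tensor] = None
    last_hidden_state_text: Optional[torch.Tensor] = None
    vision_hidden_states: Optional[Tuple[torch.Tensor, ...]] = None
    text_hidden_states: Optional[Tuple[torch.Tensor, ...]] = None

print([f.name for f in fields(ToyEncoderOutput)][2:4])  # ['vision_hidden_states', 'text_hidden_states']
```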
@@ -1803,8 +1803,8 @@ def forward( return GroundingDinoEncoderOutput( last_hidden_state_vision=vision_features, last_hidden_state_text=text_features, - hidden_states_vision=encoder_vision_states, - hidden_states_text=encoder_text_states, + vision_hidden_states=encoder_vision_states, + text_hidden_states=encoder_text_states, attentions=all_attns, ) @@ -3042,8 +3042,8 @@ def forward( encoder_outputs = GroundingDinoEncoderOutput( last_hidden_state_vision=encoder_outputs[0], last_hidden_state_text=encoder_outputs[1], - hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, + vision_hidden_states=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + text_hidden_states=encoder_outputs[3] if len(encoder_outputs) > 3 else None, attentions=encoder_outputs[4] if len(encoder_outputs) > 4 else None, ) @@ -3118,8 +3118,8 @@ def forward( decoder_attentions=decoder_outputs.attentions, encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, - encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, - encoder_hidden_states_text=encoder_outputs.hidden_states_text, + encoder_hidden_states_vision=encoder_outputs.vision_hidden_states, + encoder_hidden_states_text=encoder_outputs.text_hidden_states, encoder_attentions=encoder_outputs.attentions, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, From 5d1f0e77c1b96a2820722ae49bed1e67b8c62954 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 4 Feb 2024 13:17:34 +0100 Subject: [PATCH 165/252] Replace final nn.multiheadattention --- .../convert_grounding_dino_to_hf.py | 28 +++++++++++++++++++ .../grounding_dino/modeling_grounding_dino.py | 27 ++++++++++++------ 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 3d9b7673fbef38..8af08626022183 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -313,6 +313,33 @@ def read_in_q_k_v_encoder(state_dict, config): ########################################## VISION BACKBONE - END +def read_in_q_k_v_text_enhancer(state_dict, config): + hidden_size = config.hidden_size + for idx in range(config.encoder_layers): + # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.weight"] = in_proj_weight[ + :hidden_size, : + ] + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.bias"] = in_proj_bias[:hidden_size] + + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.bias"] = in_proj_bias[ + hidden_size : hidden_size * 2 + ] + + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.weight"] = in_proj_weight[ 
+ -hidden_size:, : + ] + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.bias"] = in_proj_bias[ + -hidden_size: + ] + + def read_in_q_k_v_decoder(state_dict, config): hidden_size = config.hidden_size for idx in range(config.decoder_layers): @@ -393,6 +420,7 @@ def convert_grounding_dino_checkpoint(args): for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v_encoder(new_state_dict, config) + read_in_q_k_v_text_enhancer(new_state_dict, config) read_in_q_k_v_decoder(new_state_dict, config) # Load HF model diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index f99ec9ab2c9717..add89a2927ecbd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -766,12 +766,10 @@ class GroundingDinoTextEnhancerLayer(nn.Module): def __init__(self, config): super().__init__() - self.self_attn = nn.MultiheadAttention( - embed_dim=config.d_model, - num_heads=config.encoder_attention_heads // 2, - dropout=config.text_enhancer_dropout, - batch_first=True, - ) + mha_config = copy.deepcopy(config) + mha_config.num_attention_heads = config.encoder_attention_heads // 2 + self.self_attn = GroundingDinoMultiheadAttention(mha_config) + # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) @@ -814,12 +812,23 @@ def forward( # repeat attn mask if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: - # bs, num_q, num_k - attention_masks = attention_masks.repeat(self.num_heads, 1, 1) + # batch_size, num_queries, num_keys + # TODO we shouldn't switch the attention mask here + attention_masks = ~attention_masks + attention_masks = attention_masks[:, None, :, :] + attention_masks = attention_masks.repeat(1, self.num_heads, 1, 1) + + dtype = torch.float16 + attention_masks = attention_masks.to(dtype=dtype) # fp16 compatibility + attention_masks = (1.0 - attention_masks) * torch.finfo(dtype).min queries = keys = self.with_pos_embed(hidden_states, position_embeddings) attention_output, attention_weights = self.self_attn( - query=queries, key=keys, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=attention_masks, + output_attentions=True, ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output From 339915f1d7c8896f3c9191f6c3380cd02da0d09e Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 4 Feb 2024 14:41:55 +0100 Subject: [PATCH 166/252] Fix model tests --- .../models/grounding_dino/modeling_grounding_dino.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index add89a2927ecbd..cbf63b6e7fcce2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1417,11 +1417,12 @@ def forward( # Cross-Attention Text queries = self.with_pos_embed(hidden_states, position_embeddings) + hidden_states, text_cross_attn_weights = self.encoder_attn_text( queries=queries, keys=text_encoder_hidden_states, 
values=text_encoder_hidden_states, - attention_mask=text_encoder_attention_mask, + # attention_mask=text_encoder_attention_mask, # TODO fix cross-attention mask here output_attentions=True, ) From 1bb488639ea31d26b74782fa3edc888e5ce81ee1 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 4 Feb 2024 14:47:25 +0100 Subject: [PATCH 167/252] Update year, remove GenerationTesterMixin --- docs/source/en/model_doc/grounding-dino.md | 2 +- src/transformers/models/grounding_dino/__init__.py | 2 +- .../models/grounding_dino/configuration_grounding_dino.py | 2 +- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 2 +- .../models/grounding_dino/image_processing_grounding_dino.py | 2 +- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- .../models/grounding_dino/processing_grounding_dino.py | 2 +- .../grounding_dino/test_image_processing_grounding_dino.py | 2 +- tests/models/grounding_dino/test_modeling_grounding_dino.py | 5 ++--- tests/models/grounding_dino/test_processor_grounding_dino.py | 2 +- 10 files changed, 11 insertions(+), 12 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index f3ccc78ad5c876..cc431ef448cc3f 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -1,4 +1,4 @@ -