From 92e5ce2a65c94752caa07eaa4880c3f3832ac2de Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 22 Aug 2023 18:50:32 -0300 Subject: [PATCH 001/252] Copied deformable detr --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/grounding-dino.md | 48 + src/transformers/__init__.py | 16 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/grounding_dino/__init__.py | 57 + .../configuration_grounding_dino.py | 262 ++ .../convert_grounding_dino_to_pytorch.py | 237 ++ .../models/grounding_dino/load_custom.py | 49 + .../grounding_dino/modeling_grounding_dino.py | 2513 +++++++++++++++++ tests/models/grounding_dino/__init__.py | 0 .../test_modeling_grounding_dino.py | 673 +++++ 15 files changed, 3865 insertions(+) create mode 100644 docs/source/en/model_doc/grounding-dino.md create mode 100644 src/transformers/models/grounding_dino/__init__.py create mode 100644 src/transformers/models/grounding_dino/configuration_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py create mode 100644 src/transformers/models/grounding_dino/load_custom.py create mode 100644 src/transformers/models/grounding_dino/modeling_grounding_dino.py create mode 100644 tests/models/grounding_dino/__init__.py create mode 100644 tests/models/grounding_dino/test_modeling_grounding_dino.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d054faf0712fe7..0b5e0434e7bb2f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -506,6 +506,8 @@ title: FocalNet - local: model_doc/glpn title: GLPN + - local: model_doc/grounding-dino + title: Grounding DINO - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md new file mode 100644 index 00000000000000..161a90609174b3 --- /dev/null +++ b/docs/source/en/model_doc/grounding-dino.md @@ -0,0 +1,48 @@ + + +# Grounding DINO + +## Overview + +The Grounding DINO model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
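A minimal instantiation sketch, mirroring the example in the `GroundingDINOConfig` docstring (randomly initialized weights; no pretrained checkpoint is assumed):

```python
from transformers import GroundingDINOConfig, GroundingDINOModel

# Build the default configuration and a randomly initialized model from it
configuration = GroundingDINOConfig()
model = GroundingDINOModel(configuration)

# The configuration can be read back from the model
configuration = model.config
```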
+ + +## GroundingDINOConfig + +[[autodoc]] GroundingDINOConfig + +## GroundingDINOModel + +[[autodoc]] GroundingDINOModel + - forward + +## GroundingDINOForObjectDetection + +[[autodoc]] GroundingDINOForObjectDetection + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9b95aadffccc6f..aa2f7837b4ce67 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -274,6 +274,7 @@ "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"], "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], "models.deprecated": [], "models.deprecated.bort": [], @@ -1541,6 +1542,14 @@ "DeformableDetrPreTrainedModel", ] ) + _import_structure["models.grounding_dino"].extend( + [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + ) _import_structure["models.deit"].extend( [ "DEIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4329,6 +4338,7 @@ DecisionTransformerConfig, ) from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.deprecated.mctct import ( MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5435,6 +5445,12 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) from .models.deit import ( DEIT_PRETRAINED_MODEL_ARCHIVE_LIST, DeiTForImageClassification, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 8f60447e7319f9..376f9353608e56 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -59,6 +59,7 @@ deberta_v2, decision_transformer, deformable_detr, + grounding_dino, deit, deprecated, deta, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index a345235951d48c..db5e5f86761b88 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -71,6 +71,7 @@ ("deberta-v2", "DebertaV2Config"), ("decision_transformer", "DecisionTransformerConfig"), ("deformable_detr", "DeformableDetrConfig"), + ("grounding-dino", "GroundingDINOConfig"), ("deit", "DeiTConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), @@ -277,6 +278,7 @@ ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deta", "DETA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -474,6 +476,7 @@ ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), ("deformable_detr", "Deformable DETR"), + ("grounding-dino", "Grounding DINO"), ("deit", 
"DeiT"), ("deplot", "DePlot"), ("deta", "DETA"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 90ece37c657191..78a0686c4816b0 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -50,6 +50,7 @@ ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("data2vec-vision", "BeitFeatureExtractor"), ("deformable_detr", "DeformableDetrFeatureExtractor"), + ("grounding-dino", "GroundingDINOFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("dinat", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 37ccc829de1ba5..ec8bf20938fd7a 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -53,6 +53,7 @@ ("cvt", "ConvNextImageProcessor"), ("data2vec-vision", "BeitImageProcessor"), ("deformable_detr", "DeformableDetrImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("deit", "DeiTImageProcessor"), ("deta", "DetaImageProcessor"), ("detr", "DetrImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 8be38bb3f8d577..2c54349e8306b2 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -69,6 +69,7 @@ ("deberta-v2", "DebertaV2Model"), ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), + ("grounding-dino", "GroundingDINOModel"), ("deit", "DeiTModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), @@ -619,6 +620,7 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), + ("grounding-dino", "GroundingDINOForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py new file mode 100644 index 00000000000000..e3767e017d1023 --- /dev/null +++ b/src/transformers/models/grounding_dino/__init__.py @@ -0,0 +1,57 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_grounding_dino"] = [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py new file mode 100644 index 00000000000000..0b3ae3d74d3475 --- /dev/null +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -0,0 +1,262 @@ +# coding=utf-8 +# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Grounding DINO model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + +GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "idea-research/grg-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", +} + + + +class GroundingDINOConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate + a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Grounding DINO + [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. 
Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`GroundingDINOModel`] can detect in a single image. In case `two_stage` is set to `True`, we use + `two_stage_num_proposals` instead. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional + backbone from the timm package. For a list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. 
+ bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + num_feature_levels (`int`, *optional*, defaults to 4): + The number of input feature levels. + encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `False`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. + two_stage_num_proposals (`int`, *optional*, defaults to 300): + The number of region proposals to be generated, in case `two_stage` is set to `True`. + with_box_refine (`bool`, *optional*, defaults to `False`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. 
+ + Examples: + + ```python + >>> from transformers import GroundingDINOConfig, GroundingDINOModel + + >>> # Initializing a Grounding DINO SenseTime/deformable-detr style configuration + >>> configuration = GroundingDINOConfig() + + >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> model = GroundingDINOModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "grounding-dino" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + use_timm_backbone=True, + backbone_config=None, + num_channels=3, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=1024, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + return_intermediate=True, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + use_pretrained_backbone=True, + dilation=False, + num_feature_levels=4, + encoder_n_points=4, + decoder_n_points=4, + two_stage=False, + two_stage_num_proposals=300, + with_box_refine=False, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + focal_alpha=0.25, + disable_custom_kernels=False, + **kwargs, + ): + if backbone_config is not None and use_timm_backbone: + raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") + + if not use_timm_backbone: + if backbone_config is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_channels = num_channels + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.dilation = dilation + # deformable attributes + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + self.two_stage_num_proposals = two_stage_num_proposals + self.with_box_refine = with_box_refine + if two_stage is True and with_box_refine is False: + raise ValueError("If two_stage is True, with_box_refine must be True.") + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py new file mode 100644 index 00000000000000..d3cef0366b2bca --- /dev/null +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py @@ -0,0 +1,237 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert Grounding DINO checkpoints.""" + + +import argparse +import json +from pathlib import Path + +import requests +import torch +from huggingface_hub import cached_download, hf_hub_url +from PIL import Image + +from transformers import GroundingDINOConfig, GroundingDINOForObjectDetection, DeformableDetrImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def rename_key(orig_key): + if "backbone.0.body" in orig_key: + orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") + if "transformer" in orig_key: + orig_key = orig_key.replace("transformer.", "") + if "norm1" in orig_key: + if "encoder" in orig_key: + orig_key = orig_key.replace("norm1", "self_attn_layer_norm") + else: + orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") + if "norm2" in orig_key: + if "encoder" in orig_key: + orig_key = orig_key.replace("norm2", "final_layer_norm") + else: + orig_key = orig_key.replace("norm2", "self_attn_layer_norm") + if "norm3" in orig_key: + orig_key = orig_key.replace("norm3", "final_layer_norm") + if "linear1" in orig_key: + orig_key = orig_key.replace("linear1", "fc1") + if "linear2" in orig_key: + orig_key = orig_key.replace("linear2", "fc2") + if "query_embed" in orig_key: + orig_key = orig_key.replace("query_embed", "query_position_embeddings") + if "cross_attn" in orig_key: + orig_key = orig_key.replace("cross_attn", "encoder_attn") + + return orig_key + + +def read_in_q_k_v(state_dict): + # transformer decoder self-attention layers + for i in range(6): + # read in weights + bias of input projection layer of self-attention + in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_grounding_dino_checkpoint( + checkpoint_path, + single_scale, + dilation, + with_box_refine, + two_stage, + pytorch_dump_folder_path, + push_to_hub, +): + """ + Copy/paste/tweak model's weights to our Grounding DINO structure. 
+ """ + + # load default config + config = GroundingDINOConfig() + # set config attributes + if single_scale: + config.num_feature_levels = 1 + config.dilation = dilation + config.with_box_refine = with_box_refine + config.two_stage = two_stage + # set labels + config.num_labels = 91 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load image processor + image_processor = DeformableDetrImageProcessor(format="coco_detection") + + # prepare image + img = prepare_img() + encoding = image_processor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info("Converting model...") + + # load original state dict + state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + # rename keys + for key in state_dict.copy().keys(): + val = state_dict.pop(key) + state_dict[rename_key(key)] = val + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "model." + for key in state_dict.copy().keys(): + if not key.startswith("class_embed") and not key.startswith("bbox_embed"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = GroundingDINOForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + + device = "cuda" if torch.cuda.is_available() else "cpu" + model.to(device) + # verify our conversion + outputs = model(pixel_values.to(device)) + + expected_logits = torch.tensor( + [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] + ) + expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) + + if single_scale: + expected_logits = torch.tensor( + [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + ) + expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) + + if single_scale and dilation: + expected_logits = torch.tensor( + [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] + ) + expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) + + if with_box_refine: + expected_logits = torch.tensor( + [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] + ) + expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) + + if with_box_refine and two_stage: + expected_logits = torch.tensor( + [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] + ) + expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]]) + + print("Logits:", outputs.logits[0, :3, :3]) + + assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) + + print("Everything ok!") + + # Save model and image processor + logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") + 
Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + image_processor.save_pretrained(pytorch_dump_folder_path) + + # Push to hub + if push_to_hub: + model_name = "deformable-detr" + model_name += "-single-scale" if single_scale else "" + model_name += "-dc5" if dilation else "" + model_name += "-with-box-refine" if with_box_refine else "" + model_name += "-two-stage" if two_stage else "" + print("Pushing model to hub...") + model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", + type=str, + default="/home/niels/checkpoints/grounding_dino/r50_grounding_dino-checkpoint.pth", + help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", + ) + parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") + parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") + parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") + parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the folder to output PyTorch model.", + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + args = parser.parse_args() + convert_grounding_dino_checkpoint( + args.checkpoint_path, + args.single_scale, + args.dilation, + args.with_box_refine, + args.two_stage, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/src/transformers/models/grounding_dino/load_custom.py b/src/transformers/models/grounding_dino/load_custom.py new file mode 100644 index 00000000000000..97b8f09fb5f446 --- /dev/null +++ b/src/transformers/models/grounding_dino/load_custom.py @@ -0,0 +1,49 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Loading of Grounding DINO's CUDA kernels""" +import os +from pathlib import Path + + +def load_cuda_kernels(): + from torch.utils.cpp_extension import load + + root = Path(__file__).resolve().parent.parent.parent / "kernels" / "grounding_dino" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] + + load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) + + import MultiScaleDeformableAttention as MSDA + + return MSDA diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py new file mode 100644 index 00000000000000..ee80a562e4b851 --- /dev/null +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -0,0 +1,2513 @@ +# coding=utf-8 +# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Grounding DINO model.""" + + +import copy +import math +import warnings +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_torch_cuda_available, + is_vision_available, + replace_return_docstrings, + requires_backends, +) +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import is_ninja_available, logging +from ..auto import AutoBackbone +from .configuration_grounding_dino import GroundingDINOConfig +from .load_custom import load_cuda_kernels + + +logger = logging.get_logger(__name__) + +# Move this to not compile only when importing, this needs to happen later, like in __init__. 
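# A sketch of one way to do that deferral (hypothetical helper, not called by the code below):
# build the kernels on first use instead of at import time.
def _lazy_load_multi_scale_deformable_attention_kernels():
    """Hypothetical helper: compile/load the custom CUDA kernels lazily rather than at import time."""
    global MultiScaleDeformableAttention
    if MultiScaleDeformableAttention is None and is_torch_cuda_available() and is_ninja_available():
        try:
            MultiScaleDeformableAttention = load_cuda_kernels()
        except Exception as exception:
            logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {exception}")
    return MultiScaleDeformableAttention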
+if is_torch_cuda_available() and is_ninja_available(): + logger.info("Loading custom CUDA kernels...") + try: + MultiScaleDeformableAttention = load_cuda_kernels() + except Exception as e: + logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}") + MultiScaleDeformableAttention = None +else: + MultiScaleDeformableAttention = None + +if is_vision_available(): + from transformers.image_transforms import center_to_corners_format + + +class MultiScaleDeformableAttentionFunction(Function): + @staticmethod + def forward( + context, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + context.im2col_step = im2col_step + output = MultiScaleDeformableAttention.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + context.im2col_step, + ) + context.save_for_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights + ) + return output + + @staticmethod + @once_differentiable + def backward(context, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = context.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + context.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "GroundingDINOConfig" +_CHECKPOINT_FOR_DOC = "idea-research/grg-dino-tiny" + +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "idea-research/grg-dino-tiny", + # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino +] + + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->GroundingDINO +class GroundingDINODecoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDINODecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOModelOutput(ModelOutput): + """ + Base class for outputs of the Grounding DINO encoder-decoder model. + + Args: + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. 
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + """ + + init_reference_points: torch.FloatTensor = None + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->GroundingDINO +class GroundingDINOObjectDetectionOutput(ModelOutput): + """ + Output type of [`GroundingDINOForObjectDetection`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. 
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~GroundingDINOProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, + 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average + in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). 
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + init_reference_points: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional = None + enc_outputs_coord_logits: Optional = None + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDINO +class GroundingDINOFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. 
+ """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDINO +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDINOFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) + + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvEncoder with DeformableDetr->GroundingDINO +class GroundingDINOConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by GroundingDINOFrozenBatchNorm2d as defined above. 
+ + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = {} + if config.dilation: + kwargs["output_stride"] = 16 + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), + in_chans=config.num_channels, + **kwargs, + ) + else: + backbone = AutoBackbone.from_config(config.backbone_config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDINO +class GroundingDINOConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +# Copied from transformers.models.detr.modeling_detr._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): + """ + Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. + """ + batch_size, source_len = mask.size() + target_len = target_len if target_len is not None else source_len + + expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->GroundingDINO +class GroundingDINOSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
+ """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding +class GroundingDINOLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->GroundingDINO +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = GroundingDINOSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +def multi_scale_deformable_attention( + value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor +) -> Tensor: + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, 
num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Grounding DINO. + """ + + def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): + super().__init__() + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in GroundingDINOMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." 
+ ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + self._reset_parameters() + + def _reset_parameters(self): + nn.init.constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.n_heads, 1, 1, 2) + .repeat(1, self.n_levels, self.n_points, 1) + ) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(self.attention_weights.weight.data, 0.0) + nn.init.constant_(self.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(self.value_proj.weight.data) + nn.init.constant_(self.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(self.output_proj.weight.data) + nn.init.constant_(self.output_proj.bias.data, 0.0) + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif reference_points.shape[-1] == 4: + 
sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + if self.disable_custom_kernels: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + else: + try: + # custom kernel + output = MultiScaleDeformableAttentionFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + except Exception: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + output = self.output_proj(output) + + return output, attention_weights + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the Grounding DINO paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, 
key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO +class GroundingDINOEncoderLayer(nn.Module): + def __init__(self, config: GroundingDINOConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = GroundingDINOMultiscaleDeformableAttention( + config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. 
+ reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO +class GroundingDINODecoderLayer(nn.Module): + def __init__(self, config: GroundingDINOConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = GroundingDINOMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. 
+ position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead +class GroundingDINOClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->GroundingDINO +class GroundingDINOPreTrainedModel(PreTrainedModel): + config_class = 
GroundingDINOConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, GroundingDINOLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): + module._reset_parameters() + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GroundingDINODecoder): + module.gradient_checkpointing = value + + +GROUNDING_DINO_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GroundingDINOConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GROUNDING_DINO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`] + for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->GroundingDINO +class GroundingDINOEncoder(GroundingDINOPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`GroundingDINOEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: GroundingDINOConfig + """ + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDINOEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. Used in decoder. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINODecoder(GroundingDINOPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Grounding DINO: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: GroundingDINOConfig + """ + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Grounding DINO + self.bbox_embed = None + self.class_embed = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. 
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each self-attention layer. + reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): + Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. + spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of the feature maps. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): + Indexes for the start of each feature level. In range `[0, sequence_length]`. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): + Ratio of valid area in each feature level. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + for idx, decoder_layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + else: + if reference_points.shape[-1] != 2: + raise ValueError("Reference points' last dimension must be of size 2") + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + 
hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + if reference_points.shape[-1] != 2: + raise ValueError( + f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}" + ) + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return GroundingDINODecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. 
+ """, + GROUNDING_DINO_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO +class GroundingDINOModel(GroundingDINOPreTrainedModel): + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = GroundingDINOConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + if not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + + self.encoder = GroundingDINOEncoder(config) + self.decoder = GroundingDINODecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_heigth = valid_height.float() / height + valid_ratio_width = valid_width.float() / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 
0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, GroundingDINOModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = GroundingDINOModel.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = 
image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(features): + sources.append(self.input_proj[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj[level](features[-1][0]) + else: + source = self.input_proj[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if not self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + valid_ratios = valid_ratios.float() + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=source_flatten, + 
attention_mask=mask_flatten, + position_embeddings=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, prepare decoder inputs + batch_size, _, num_channels = encoder_outputs[0].shape + enc_outputs_class = None + enc_outputs_coord_logits = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes + ) + + # hack implementation for two-stage Grounding DINO + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.config.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + return GroundingDINOModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + 
cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) + + +@add_start_docstrings( + """ + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. + """, + GROUNDING_DINO_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO +class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + # Grounding DINO encoder-decoder model + self.model = GroundingDINOModel(config) + + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = GroundingDINOMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + for box_embed in self.bbox_embed: + nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + # Initialize weights and apply final processing + self.post_init() + + # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, GroundingDINOForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + ... 0 + ... ] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... 
) + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = GroundingDINOHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = GroundingDINOLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + if self.config.two_stage: + enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid() + outputs_loss["enc_outputs"] = {"logits": outputs.enc_outputs_class, "pred_boxes": enc_outputs_coord} + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] 
for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = GroundingDINOObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) + + return dict_outputs + + +# Copied from transformers.models.detr.modeling_detr.dice_loss +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDINO +class GroundingDINOLoss(nn.Module): + """ + This class computes the losses for `GroundingDINOForObjectDetection`. 
The process happens in two steps: 1) we + compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of + matched ground-truth / prediction (supervise class and box). + + Args: + matcher (`GroundingDINOHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parameter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros( + [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + dtype=source_logits.dtype, + layout=source_logits.layout, + device=source_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * source_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. 
+ """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx + + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
+ if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + if "enc_outputs" in outputs: + enc_outputs = outputs["enc_outputs"] + bin_targets = copy.deepcopy(targets) + for bt in bin_targets: + bt["class_labels"] = torch.zeros_like(bt["class_labels"]) + indices = self.matcher(enc_outputs, bin_targets) + for loss in self.losses: + l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) + l_dict = {k + "_enc": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead +class GroundingDINOMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDINO +class GroundingDINOHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
+ + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# Copied from transformers.models.detr.modeling_detr._upcast +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +# Copied from transformers.models.detr.modeling_detr.box_area +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# Copied from transformers.models.detr.modeling_detr.box_iou +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. 
+ + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) + + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] + + return iou - (area - union) / area + + +# Copied from transformers.models.detr.modeling_detr._max_by_axis +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +# Copied from transformers.models.detr.modeling_detr.NestedTensor +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + batch_size, num_channels, height, width = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) diff --git a/tests/models/grounding_dino/__init__.py b/tests/models/grounding_dino/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py new file mode 100644 index 00000000000000..3007eef6399916 --- /dev/null +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -0,0 +1,673 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
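For reference, `nested_tensor_from_tensor_list` at the end of the modeling file above is what pads a batch of differently sized images to a common shape and builds the boolean padding mask. A minimal sketch of that behaviour (toy image sizes, not part of the patch):

```python
import torch

images = [torch.rand(3, 4, 6), torch.rand(3, 5, 5)]         # two images with different H x W
max_size = [3, 5, 6]                                         # per-dimension max over the batch
batch = torch.zeros(len(images), *max_size)
mask = torch.ones(len(images), max_size[1], max_size[2], dtype=torch.bool)
for img, pad_img, m in zip(images, batch, mask):
    pad_img[:, : img.shape[1], : img.shape[2]].copy_(img)   # zero-pad to the common size
    m[: img.shape[1], : img.shape[2]] = False                # True marks padded positions
print(batch.shape, mask.shape)  # torch.Size([2, 3, 5, 6]) torch.Size([2, 5, 6])
```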
+""" Testing suite for the PyTorch Grounding DINO model. """ + + +import inspect +import math +import unittest +from typing import Dict, List, Tuple + +from transformers import GroundingDINOConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers.file_utils import cached_property +from transformers.testing_utils import ( + require_timm, + require_torch, + require_torch_gpu, + require_vision, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import GroundingDINOForObjectDetection, GroundingDINOModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class GroundingDINOModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + image_size=196, + n_targets=8, + num_labels=91, + num_feature_levels=4, + encoder_n_points=2, + decoder_n_points=6, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.image_size = image_size + self.n_targets = n_targets + self.num_labels = num_labels + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = ( + math.ceil(self.image_size / 8) ** 2 + + math.ceil(self.image_size / 16) ** 2 + + math.ceil(self.image_size / 32) ** 2 + + math.ceil(self.image_size / 64) ** 2 + ) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + resnet_config = ResNetConfig( + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + hidden_act="relu", + num_labels=3, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + return 
GroundingDINOConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + num_feature_levels=self.num_feature_levels, + encoder_n_points=self.encoder_n_points, + decoder_n_points=self.decoder_n_points, + use_timm_backbone=False, + backbone_config=resnet_config, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, labels): + model = GroundingDINOModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) + + def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = GroundingDINOForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GroundingDINOModel, GroundingDINOForObjectDetection) if is_torch_available() else () + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "GroundingDINOForObjectDetection": + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.image_size, + self.model_tester.image_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = GroundingDINOModelTester(self) + self.config_tester = ConfigTester(self, config_class=GroundingDINOConfig, has_text_modality=False) + + def 
test_config(self): + # we don't test common_properties and arguments_init as these don't apply for Grounding DINO + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + + def test_grounding_dino_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_grounding_dino_model(*config_and_inputs) + + def test_grounding_dino_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_grounding_dino_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="Grounding DINO does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Grounding DINO does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="Grounding DINO is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="Grounding DINO does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + out_len = len(outputs) + + correct_outlen = 8 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "GroundingDINOForObjectDetection": + correct_outlen += 2 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + 
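+            # Note: with deformable attention, the cross-attention weights returned by the decoder have
+            # shape (batch_size, num_queries, num_heads, num_feature_levels, num_points), which is why
+            # the checks below compare the last three dims against
+            # [num_attention_heads, num_feature_levels, decoder_n_points] instead of the usual
+            # [num_heads, target_len, source_len].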
self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.decoder_n_points, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." 
+ ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + # we take the second output since last_hidden_state is the second item + output = outputs[1] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if 
model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "GroundingDINOForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if param.requires_grad: + if ( + "level_embed" in name + or "sampling_offsets.bias" in name + or "value_proj" in name + or "output_proj" in name + or "reference_points" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_two_stage_training(self): + model_class = GroundingDINOForObjectDetection + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + config.two_stage = True + config.auxiliary_loss = True + config.with_box_refine = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class GroundingDINOModelIntegrationTests(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") if is_vision_available() else None + + def test_inference_object_detection_head(self): + model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-9.6645, -4.3449, -5.8705], [-9.7035, 
-3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] + ).to(torch_device) + expected_boxes = torch.tensor( + [[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) + + # verify postprocessing + results = image_processor.post_process_object_detection( + outputs, threshold=0.3, target_sizes=[image.size[::-1]] + )[0] + expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]).to(torch_device) + expected_labels = [17, 17, 75, 75, 63] + expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841]).to(torch_device) + + self.assertEqual(len(results["scores"]), 5) + self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4)) + self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) + + def test_inference_object_detection_head_with_box_refine_two_stage(self): + model = GroundingDINOForObjectDetection.from_pretrained( + "SenseTime/deformable-detr-with-box-refine-two-stage" + ).to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] + ).to(torch_device) + expected_boxes = torch.tensor( + [[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) + + @require_torch_gpu + def test_inference_object_detection_head_equivalence_cpu_gpu(self): + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt") + pixel_values = encoding["pixel_values"] + pixel_mask = encoding["pixel_mask"] + + # 1. run model on CPU + model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr-single-scale") + + with torch.no_grad(): + cpu_outputs = model(pixel_values, pixel_mask) + + # 2. run model on GPU + model.to("cuda") + + with torch.no_grad(): + gpu_outputs = model(pixel_values.to("cuda"), pixel_mask.to("cuda")) + + # 3. 
assert equivalence + for key in cpu_outputs.keys(): + assert torch.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-4) + + expected_logits = torch.tensor( + [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + ) + assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-4) From f6d87c104d941fb2f1b09960407623dd6fc45d04 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 23 Aug 2023 12:25:43 -0300 Subject: [PATCH 002/252] First commit --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.md | 2 + docs/source/en/tasks/object_detection.md | 2 +- .../configuration_grounding_dino.py | 6 +- .../convert_grounding_dino_to_hf.py | 242 ++++++++++++++++++ .../convert_grounding_dino_to_pytorch.py | 237 ----------------- .../grounding_dino/modeling_grounding_dino.py | 4 +- src/transformers/utils/dummy_pt_objects.py | 24 ++ 14 files changed, 281 insertions(+), 243 deletions(-) create mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py delete mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py diff --git a/README.md b/README.md index 41fb758abe1500..d952ba96ddfa62 100644 --- a/README.md +++ b/README.md @@ -371,6 +371,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. diff --git a/README_es.md b/README_es.md index 6a0701b09d3432..2f38327dcb84ca 100644 --- a/README_es.md +++ b/README_es.md @@ -348,6 +348,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. diff --git a/README_hd.md b/README_hd.md index 8651678669a7e7..01dcffadaef0db 100644 --- a/README_hd.md +++ b/README_hd.md @@ -320,6 +320,7 @@ conda install -c huggingface transformers 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv .org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https ://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https:// arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। diff --git a/README_ja.md b/README_ja.md index 8e497e94175a0a..c88e62e459d215 100644 --- a/README_ja.md +++ b/README_ja.md @@ -382,6 +382,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 1. 
**[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました. 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) diff --git a/README_ko.md b/README_ko.md index 3f33e4b199d367..885494ef0a1abd 100644 --- a/README_ko.md +++ b/README_ko.md @@ -297,6 +297,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. 
**[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 135f9b35a9631f..c9d78e7d9887a3 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -321,6 +321,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. 
Mahoney, Kurt Keutzer 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 781c412ca2a1db..4c8ef2c098aa03 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -333,6 +333,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index e1c346971f386e..b6738dcf3a9b04 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -137,6 +137,7 @@ The documentation is organized into five sections: 1. 
**[GPTBigCode](model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. @@ -359,6 +360,7 @@ Flax), PyTorch, and/or TensorFlow. 
| GPTBigCode | ✅ | ❌ | ❌ | | GPTSAN-japanese | ✅ | ❌ | ❌ | | Graphormer | ✅ | ❌ | ❌ | +| Grounding DINO | ✅ | ❌ | ❌ | | GroupViT | ✅ | ✅ | ❌ | | Hubert | ✅ | ✅ | ❌ | | I-BERT | ✅ | ❌ | ❌ | diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 563beb274253d5..4eab9e58fb27da 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [Grounding DINO](../model_doc/grounding-dino), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0b3ae3d74d3475..23cd86fd3f9d44 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -22,7 +22,7 @@ logger = logging.get_logger(__name__) GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "idea-research/grg-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", + "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } @@ -151,8 +151,8 @@ class GroundingDINOConfig(PretrainedConfig): def __init__( self, - use_timm_backbone=True, - backbone_config=None, + use_timm_backbone=False, + backbone_config={"model_type": "swin"}, num_channels=3, num_queries=300, max_position_embeddings=1024, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py new file mode 100644 index 00000000000000..b5de1d8a652c0e --- /dev/null +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -0,0 +1,242 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert GroundingDINO SimMIM checkpoints from the original repository. 
+ +URL: https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" + +import argparse + +import requests +import torch +from PIL import Image +from torchvision import transforms as T +import torchvision.transforms.functional as F + +from transformers import ( + GroundingDINOConfig, GroundingDINOForObjectDetection +) + +IMAGENET_MEAN = [0.485, 0.456, 0.406] +IMAGENET_STD = [0.229, 0.224, 0.225] + + +def get_grounding_dino_config(model_name): + config = GroundingDINOConfig() + + if "tiny" in model_name: + window_size = 7 + embed_dim = 96 + depths = (2, 2, 6, 2) + num_heads = (3, 6, 12, 24) + image_size = 224 + elif "base" in model_name: + window_size = 12 + embed_dim = 128 + depths = (2, 2, 18, 2) + num_heads = (4, 8, 16, 32) + image_size = 384 + else: + raise ValueError("Model not supported, only supports base and large variants") + + config.backbone_config.window_size = window_size + config.backbone_config.image_size = image_size + config.backbone_config.embed_dim = embed_dim + config.backbone_config.depths = depths + config.backbone_config.num_heads = num_heads + config.backbone_config.out_indices = [2, 3, 4] + + return config + + +def create_rename_keys(config): + rename_keys = [] + # fmt: off + #TODO names might change after modifing GroundingDINOModel class + ########################################## VISION BACKBONE - START + # patch embedding layer + rename_keys.append(("module.backbone.0.patch_embed.proj.weight", + "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("module.backbone.0.patch_embed.proj.bias", + "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) + rename_keys.append(("module.backbone.0.patch_embed.norm.weight", + "model.backbone.conv_encoder.model.embeddings.norm.weight")) + rename_keys.append(("module.backbone.0.patch_embed.norm.bias", + "model.backbone.conv_encoder.model.embeddings.norm.bias")) + + for layer, depth in enumerate(config.backbone_config.depths): + for block in range(depth): + # layernorms + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) + + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) + # attention + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) + # rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) + # intermidiate + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) + + # output + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) + + # downsample + if layer!=len(config.backbone_config.depths)-1: + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.reduction.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) + + for out_indice in config.backbone_config.out_indices: + # Grounding DINO implementation of out_indices isn't aligned with transformers + rename_keys.append((f"module.backbone.0.norm{out_indice-1}.weight", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) + rename_keys.append((f"module.backbone.0.norm{out_indice-1}.bias", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) + + ########################################## VISION BACKBONE - END + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + ########################################## VISION BACKBONE - START + embed_dim = config.backbone_config.embed_dim + for layer, depth in enumerate(config.backbone_config.depths): + hidden_size = embed_dim * 2**layer + for block in range(depth): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] + + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] + + 
state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight"] = in_proj_weight[-hidden_size :, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias"] = in_proj_bias[-hidden_size :] + ########################################## VISION BACKBONE - END + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + +@torch.no_grad() +def convert_grounding_dino_checkpoint(model_name, checkpoint_path): + #Define default GroundingDINO configuation + config = get_grounding_dino_config(model_name) + + # Load original checkpoint + original_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + + # Rename keys + new_state_dict = original_state_dict.copy() + rename_keys = create_rename_keys(config) + for src, dest in rename_keys: + rename_key(new_state_dict, src, dest) + read_in_q_k_v(new_state_dict, config) + + # Load HF implementation with default config and converted state dict + model = GroundingDINOForObjectDetection(config).eval() + model.load_state_dict(new_state_dict, strict=False) + + # Load and process test image + image = prepare_img() + image_processor = T.Compose( + [ + T.Resize(size=800, max_size=1333), + T.ToTensor(), + T.Normalize(IMAGENET_MEAN, IMAGENET_STD) + ] + ) + inputs = image_processor(image) + pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) + output= model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + for feature_map in output.feature_maps: + print(f"{feature_map.shape}") + print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") + + # outputs = model(**inputs).logits + + # print(outputs.keys()) + # print("Looks ok!") + + # if pytorch_dump_folder_path is not None: + # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + # model.save_pretrained(pytorch_dump_folder_path) + + # print(f"Saving image processor to {pytorch_dump_folder_path}") + # image_processor.save_pretrained(pytorch_dump_folder_path) + + # if push_to_hub: + # print(f"Pushing model and image processor for {model_name} to hub") + # model.push_to_hub(f"microsoft/{model_name}") + # image_processor.push_to_hub(f"microsoft/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="grounding-dino-tiny", + type=str, + choices=["grounding-dino-tiny", "grounding-dino-base"], + help="Name of the GroundingDINO model you'd like to convert.", + ) + parser.add_argument( + "--checkpoint_path", + default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny.pth", + type=str, + help="Path to the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_grounding_dino_checkpoint(args.model_name, args.checkpoint_path) \ No newline at end of file diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py deleted file mode 100644 index d3cef0366b2bca..00000000000000 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py +++ /dev/null @@ -1,237 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Grounding DINO checkpoints.""" - - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import cached_download, hf_hub_url -from PIL import Image - -from transformers import GroundingDINOConfig, GroundingDINOForObjectDetection, DeformableDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_key(orig_key): - if "backbone.0.body" in orig_key: - orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") - if "transformer" in orig_key: - orig_key = orig_key.replace("transformer.", "") - if "norm1" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm1", "self_attn_layer_norm") - else: - orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") - if "norm2" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm2", "final_layer_norm") - else: - orig_key = orig_key.replace("norm2", "self_attn_layer_norm") - if "norm3" in orig_key: - orig_key = orig_key.replace("norm3", "final_layer_norm") - if "linear1" in orig_key: - orig_key = orig_key.replace("linear1", "fc1") - if "linear2" in orig_key: - orig_key = orig_key.replace("linear2", "fc2") - if "query_embed" in orig_key: - orig_key = orig_key.replace("query_embed", "query_position_embeddings") - if "cross_attn" in orig_key: - orig_key = orig_key.replace("cross_attn", "encoder_attn") - - return orig_key - - -def read_in_q_k_v(state_dict): - # transformer decoder self-attention layers - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of 
cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_grounding_dino_checkpoint( - checkpoint_path, - single_scale, - dilation, - with_box_refine, - two_stage, - pytorch_dump_folder_path, - push_to_hub, -): - """ - Copy/paste/tweak model's weights to our Grounding DINO structure. - """ - - # load default config - config = GroundingDINOConfig() - # set config attributes - if single_scale: - config.num_feature_levels = 1 - config.dilation = dilation - config.with_box_refine = with_box_refine - config.two_stage = two_stage - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - image_processor = DeformableDetrImageProcessor(format="coco_detection") - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "model." - for key in state_dict.copy().keys(): - if not key.startswith("class_embed") and not key.startswith("bbox_embed"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = GroundingDINOForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - # verify our conversion - outputs = model(pixel_values.to(device)) - - expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ) - expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) - - if single_scale: - expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] - ) - expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) - - if single_scale and dilation: - expected_logits = torch.tensor( - [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] - ) - expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) - - if with_box_refine: - expected_logits = torch.tensor( - [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] - ) - expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) - - if with_box_refine and two_stage: - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ) - expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], 
[0.5490, 0.2763, 0.0564]]) - - print("Logits:", outputs.logits[0, :3, :3]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - - print("Everything ok!") - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - model_name = "deformable-detr" - model_name += "-single-scale" if single_scale else "" - model_name += "-dc5" if dilation else "" - model_name += "-with-box-refine" if with_box_refine else "" - model_name += "-two-stage" if two_stage else "" - print("Pushing model to hub...") - model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - type=str, - default="/home/niels/checkpoints/grounding_dino/r50_grounding_dino-checkpoint.pth", - help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", - ) - parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") - parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") - parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") - parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
- ) - args = parser.parse_args() - convert_grounding_dino_checkpoint( - args.checkpoint_path, - args.single_scale, - args.dilation, - args.with_box_refine, - args.two_stage, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ee80a562e4b851..603bdfdd8e8126 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -122,10 +122,10 @@ def backward(context, grad_output): logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "GroundingDINOConfig" -_CHECKPOINT_FOR_DOC = "idea-research/grg-dino-tiny" +_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "idea-research/grg-dino-tiny", + "idea-research/grounding-dino-tiny", # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino ] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 5724e689f2fce2..f0bc1e774383b5 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2340,6 +2340,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GroundingDINOForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = None From f2052b0f44d2157de631adfdd0ccfb53ba7ff7bf Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 27 Aug 2023 01:47:21 -0300 Subject: [PATCH 003/252] Added bert to model --- .../configuration_grounding_dino.py | 7 +- .../convert_grounding_dino_to_hf.py | 13 +- .../grounding_dino/modeling_grounding_dino.py | 686 +++++++++++++++++- 3 files changed, 692 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 23cd86fd3f9d44..9025d01e725561 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -16,7 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -44,6 +44,8 @@ class GroundingDINOConfig(PretrainedConfig): backbone_config (`PretrainedConfig` or `dict`, *optional*): The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which case it will default to `ResNetConfig()`. + text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): + The configuration of the text backbone model. Should be a bert-like config. num_channels (`int`, *optional*, defaults to 3): The number of input channels. 
num_queries (`int`, *optional*, defaults to 300): @@ -153,6 +155,7 @@ def __init__( self, use_timm_backbone=False, backbone_config={"model_type": "swin"}, + text_backbone_config="bert-base-uncased", num_channels=3, num_queries=300, max_position_embeddings=1024, @@ -251,6 +254,8 @@ def __init__( self.eos_coefficient = eos_coefficient self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels + # Text backbone + self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index b5de1d8a652c0e..d5b07b32c3f49f 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -60,7 +60,7 @@ def get_grounding_dino_config(model_name): return config -def create_rename_keys(config): +def create_rename_keys(state_dict, config): rename_keys = [] # fmt: off #TODO names might change after modifing GroundingDINOModel class @@ -126,10 +126,14 @@ def create_rename_keys(config): ########################################## VISION BACKBONE - END + ########################################## TEXT BACKBONE - START + for layer_name, params in state_dict.items(): + if "module.bert" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) + ########################################## TEXT BACKBONE - END # fmt: on return rename_keys - def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val @@ -172,7 +176,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Rename keys new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(config) + rename_keys = create_rename_keys(original_state_dict, config) for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) @@ -192,7 +196,8 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) inputs = image_processor(image) pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) - output= model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + output = model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + for feature_map in output.feature_maps: print(f"{feature_map.shape}") print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 603bdfdd8e8126..8bea6eee50096e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -19,7 +19,7 @@ import math import warnings from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -39,8 +39,13 @@ replace_return_docstrings, requires_backends, ) -from ...modeling_outputs import BaseModelOutput +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPoolingAndCrossAttentions, + BaseModelOutputWithPastAndCrossAttentions +) from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, 
find_pruneable_heads_and_indices, prune_linear_layer from ...pytorch_utils import meshgrid from ...utils import is_ninja_available, logging from ..auto import AutoBackbone @@ -173,7 +178,7 @@ class GroundingDINODecoderOutput(ModelOutput): # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOModelOutput(ModelOutput): """ - Base class for outputs of the Grounding DINO encoder-decoder model. + Base class for outputs of the Deformable DETR encoder-decoder model. Args: init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): @@ -432,6 +437,7 @@ def __init__(self, config): if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: parameter.requires_grad_(False) + # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDINO def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps @@ -600,7 +606,7 @@ def multi_scale_deformable_attention( # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOMultiscaleDeformableAttention(nn.Module): """ - Multiscale deformable attention as proposed in Grounding DINO. + Multiscale deformable attention as proposed in Deformable DETR. """ def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): @@ -736,7 +742,7 @@ class GroundingDINOMultiheadAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the Grounding DINO paper). + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). """ def __init__( @@ -1294,7 +1300,7 @@ class GroundingDINODecoder(GroundingDINOPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - Some tweaks for Grounding DINO: + Some tweaks for Deformable DETR: - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. - it also returns a stack of intermediate outputs and reference points from all decoding layers. 
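The hunks above only adjust docstrings and `# Copied from` markers of the multiscale deformable attention and decoder classes inherited from Deformable DETR. For reference, the core of that attention mechanism is a weighted bilinear sampling of a small set of points from each feature level. A minimal sketch, assuming per-level value tensors of shape `(batch, num_heads, head_dim, H_l, W_l)` and sampling locations normalized to `[0, 1]`; this is an illustration only, not the `multi_scale_deformable_attention` helper used in this file:

import torch
import torch.nn.functional as F


def deformable_sampling(values, sampling_locations, attention_weights):
    # values: list over feature levels of (batch, num_heads, head_dim, H_l, W_l) tensors
    # sampling_locations: (batch, num_queries, num_heads, num_levels, num_points, 2), normalized to [0, 1]
    # attention_weights: (batch, num_queries, num_heads, num_levels, num_points), softmaxed over levels * points
    batch, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
    sampled = []
    for level, value in enumerate(values):
        # grid_sample expects (x, y) coordinates in [-1, 1]
        grid = 2 * sampling_locations[:, :, :, level] - 1              # (batch, num_queries, num_heads, num_points, 2)
        grid = grid.permute(0, 2, 1, 3, 4).flatten(0, 1)               # (batch * num_heads, num_queries, num_points, 2)
        value = value.flatten(0, 1)                                    # (batch * num_heads, head_dim, H_l, W_l)
        sampled.append(F.grid_sample(value, grid, mode="bilinear", padding_mode="zeros", align_corners=False))
    sampled = torch.stack(sampled, dim=-2)                             # (batch * num_heads, head_dim, num_queries, num_levels, num_points)
    weights = attention_weights.permute(0, 2, 1, 3, 4).flatten(0, 1)   # (batch * num_heads, num_queries, num_levels, num_points)
    output = (sampled * weights.unsqueeze(1)).sum(dim=(-2, -1))        # (batch * num_heads, head_dim, num_queries)
    return output.view(batch, num_heads, -1, num_queries).permute(0, 3, 1, 2).flatten(2)  # (batch, num_queries, num_heads * head_dim)

Because the offsets and weights are learned per query, the cost scales with the small number of sampled points rather than with the full H x W of every feature level.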
@@ -1310,7 +1316,7 @@ def __init__(self, config: GroundingDINOConfig): self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.gradient_checkpointing = False - # hack implementation for iterative bounding box refinement and two-stage Grounding DINO + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR self.bbox_embed = None self.class_embed = None @@ -1493,6 +1499,8 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + # Create Text Extractor + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -1772,7 +1780,7 @@ def forward( encoder_outputs[0], ~mask_flatten, spatial_shapes ) - # hack implementation for two-stage Grounding DINO + # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) @@ -1850,7 +1858,7 @@ class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): def __init__(self, config: GroundingDINOConfig): super().__init__(config) - # Grounding DINO encoder-decoder model + # Deformable DETR encoder-decoder model self.model = GroundingDINOModel(config) # Detection heads on top @@ -2178,6 +2186,7 @@ def loss_labels(self, outputs, targets, indices, num_boxes): return losses @torch.no_grad() + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality def loss_cardinality(self, outputs, targets, indices, num_boxes): """ Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. @@ -2193,6 +2202,7 @@ def loss_cardinality(self, outputs, targets, indices, num_boxes): losses = {"cardinality_error": card_err} return losses + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes def loss_boxes(self, outputs, targets, indices, num_boxes): """ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. 
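The `loss_cardinality` / `loss_boxes` hunks above only add `# Copied from` markers; the underlying box losses are the standard DETR pair of an L1 term on normalized `(cx, cy, w, h)` boxes and a generalized-IoU term. A minimal sketch, assuming the matched prediction/target pairs have already been gathered by the Hungarian matcher, and using torchvision's box utilities instead of the internal `center_to_corners_format` / `generalized_box_iou` helpers the modeling file relies on:

import torch
import torch.nn.functional as F
from torchvision.ops import box_convert, generalized_box_iou


def box_losses(src_boxes, target_boxes, num_boxes):
    # src_boxes, target_boxes: (num_matched, 4) in normalized (cx, cy, w, h) format
    loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
    giou = generalized_box_iou(
        box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
        box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
    )
    loss_giou = 1 - torch.diag(giou)  # only the matched (i, i) pairs contribute
    return {"loss_bbox": loss_bbox.sum() / num_boxes, "loss_giou": loss_giou.sum() / num_boxes}

Both terms are averaged over the total number of matched boxes so that images with many objects do not dominate the batch loss.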
@@ -2217,12 +2227,14 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): losses["loss_giou"] = loss_giou.sum() / num_boxes return losses + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx def _get_source_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) source_idx = torch.cat([source for (source, _) in indices]) return batch_idx, source_idx + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx def _get_target_permutation_idx(self, indices): # permute targets following indices batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) @@ -2511,3 +2523,659 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): else: raise ValueError("Only 3-dimensional tensors are supported") return NestedTensor(tensor, mask) + +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText +class GroundingDINOTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = 
self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText +class GroundingDINOTextSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDINOTextModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText +class GroundingDINOTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText +class GroundingDINOTextAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = GroundingDINOTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = GroundingDINOTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText +class 
GroundingDINOTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText +class GroundingDINOTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText +class GroundingDINOTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = GroundingDINOTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = GroundingDINOTextAttention(config, position_embedding_type="absolute") + self.intermediate = GroundingDINOTextIntermediate(config) + self.output = GroundingDINOTextOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + 
cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText +class GroundingDINOTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([GroundingDINOTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText +class GroundingDINOTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + +# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText +class GroundingDINOTextModel(nn.Module): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__() + self.config = config + + self.embeddings = GroundingDINOTextEmbeddings(config) + self.encoder = GroundingDINOTextEncoder(config) + + self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) From 632f8a6fb043ddbccf1bfbb9582c9f0b9f583b38 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 29 Aug 2023 23:30:53 -0300 Subject: [PATCH 004/252] Bert validated --- .../configuration_grounding_dino.py | 7 +- .../convert_grounding_dino_to_hf.py | 105 ++++++++++++++++-- .../grounding_dino/modeling_grounding_dino.py | 5 +- 3 files changed, 106 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 9025d01e725561..0b4df30f6ee46f 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -177,7 +177,7 @@ def __init__( return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", - backbone="resnet50", + backbone="swin", use_pretrained_backbone=True, dilation=False, num_feature_levels=4, @@ -196,6 +196,9 @@ def __init__( eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, + #other parameters + max_text_len = 256, + sub_sentence_present = True, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -256,6 +259,8 @@ def __init__( self.disable_custom_kernels = disable_custom_kernels # Text 
backbone self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) + self.max_text_len = max_text_len + self.sub_sentence_present = sub_sentence_present super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d5b07b32c3f49f..d5ebc9281b8733 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -25,7 +25,7 @@ import torchvision.transforms.functional as F from transformers import ( - GroundingDINOConfig, GroundingDINOForObjectDetection + GroundingDINOConfig, GroundingDINOForObjectDetection, AutoTokenizer ) IMAGENET_MEAN = [0.485, 0.456, 0.406] @@ -166,6 +166,88 @@ def prepare_img(): image = Image.open(requests.get(url, stream=True).raw).convert("RGB") return image +def text_processor(text: str, config): + def preprocess_caption(caption: str) -> str: + result = caption.lower().strip() + if result.endswith("."): + return result + return result + "." + def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + ) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + cate_to_token_mask_list = [ + torch.stack(cate_to_token_mask_listi, dim=0) + for cate_to_token_mask_listi in cate_to_token_mask_list + ] + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.to(torch.long) + tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) + special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + text = preprocess_caption(text) + tokenized = tokenizer([text], padding="longest", return_tensors="pt") + text_self_attention_masks, position_ids = 
generate_masks_with_special_tokens_and_transfer_map( + tokenized, special_tokens) + + max_text_len = config.max_text_len + sub_sentence_present = config.sub_sentence_present + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[ + :, : max_text_len, : max_text_len + ] + position_ids = position_ids[:, : max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, : max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, : max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : max_text_len] + + # extract text embeddings + if sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + + return tokenized_for_encoder + @torch.no_grad() def convert_grounding_dino_checkpoint(model_name, checkpoint_path): #Define default GroundingDINO configuation @@ -187,6 +269,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Load and process test image image = prepare_img() + text = "a cat" image_processor = T.Compose( [ T.Resize(size=800, max_size=1333), @@ -194,13 +277,21 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): T.Normalize(IMAGENET_MEAN, IMAGENET_STD) ] ) - inputs = image_processor(image) - pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) - output = model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + image_inputs = image_processor(image) + text_inputs = text_processor(text, config) + + pixel_mask = torch.ones( + ((1, image_inputs.shape[1], image_inputs.shape[2])), + dtype=torch.long, + device=image_inputs.device + ) + # output = model.model.backbone.conv_encoder.model(pixel_values=image_inputs.unsqueeze(0)) + output = model.model.text_backbone(**text_inputs) + print(output.last_hidden_state[:, :, :5]) - for feature_map in output.feature_maps: - print(f"{feature_map.shape}") - print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") + # for feature_map in output.last_hidden_state: + # print(f"{feature_map.shape}") + # print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") # outputs = model(**inputs).logits diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8bea6eee50096e..ebe151de480211 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -3014,7 +3014,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output # Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText -class GroundingDINOTextModel(nn.Module): +class GroundingDINOTextModel(PreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -3028,8 +3028,7 @@ class GroundingDINOTextModel(nn.Module): """ def __init__(self, config, add_pooling_layer=True): - super().__init__() - self.config = config + super().__init__(config) self.embeddings = GroundingDINOTextEmbeddings(config) self.encoder = GroundingDINOTextEncoder(config) From e04de0ec32d94e9cfe01b7f631a4f74e54287a5d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 31 Aug 2023 20:03:28 -0300 Subject: [PATCH 005/252] Created Text and Fusion layers for 
Encoder --- .../configuration_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 309 +++++++++++++++++- 2 files changed, 306 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0b4df30f6ee46f..e77d4be247b746 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -160,7 +160,7 @@ def __init__( num_queries=300, max_position_embeddings=1024, encoder_layers=6, - encoder_ffn_dim=1024, + encoder_ffn_dim=2048, encoder_attention_heads=8, decoder_layers=6, decoder_ffn_dim=1024, diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ebe151de480211..731172570c23d2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -854,6 +854,304 @@ def forward( return attn_output, attn_weights_reshaped +# Repeting some code to avoid convert nn.MultiheadAttention later +class GroundingDINOEncoderTextLayer(nn.Module): + def __init__( + self, + embed_dim, + num_heads, + ffn_dim: int, + dropout: float = 0.0, + bias: bool = True, + activation: str = 'relu' + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) + # Implementation of Feedforward model + self.fc1 = nn.Linear(embed_dim, ffn_dim) + self.dropout = nn.Dropout(dropout) + self.fc2 = nn.Linear(ffn_dim, embed_dim) + + self.layer_norm_before = nn.LayerNorm(embed_dim) + self.layer_norm_after = nn.LayerNorm(embed_dim) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = ACT2FN[activation] + self.num_heads = num_heads + + def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): + return hidden_state if position_embeddings is None else hidden_state + position_embeddings + + def forward( + self, + hidden_states: Tensor, + attention_masks: Optional[Tensor] = None, + position_embeddings: Optional[Tensor] = None, + ): # repeat attn mask + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[1]: + # bs, num_q, num_k + attention_masks = attention_masks.repeat(self.num_heads, 1, 1) + + q = k = self.with_pos_embed(hidden_states, position_embeddings) + attention_output = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks)[0] + + hidden_states = hidden_states + self.dropout1(attention_output) + hidden_states = self.layer_norm_before(hidden_states) + hidden_states = self.activation(self.fc1(hidden_states)) + attention_output = self.fc2(self.dropout(hidden_states)) + hidden_states = hidden_states + self.dropout2(attention_output) + hidden_states = self.layer_norm_after(hidden_states) + return hidden_states + +class BiMultiHeadAttention(nn.Module): + def __init__( + self, + vision_dim: int, + text_dim: int, + embed_dim: int, + num_heads: int, + dropout:float = 0.1 + ): + super().__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.vision_dim = vision_dim + self.text_dim = text_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
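Setting the projection weights aside, the core idea of this bi-directional attention can be sketched as a single image-text similarity matrix that is normalised along different axes; the sizes below are illustrative, and the multi-head split, value projections and clamping from the class are deliberately left out:

    import torch

    vision_features = torch.randn(2, 100, 32)   # (batch, num_image_tokens, dim) -- sizes are arbitrary
    text_features = torch.randn(2, 12, 32)      # (batch, num_text_tokens, dim)

    similarity = torch.bmm(vision_features, text_features.transpose(1, 2)) / 32 ** 0.5  # (batch, n_img, n_text)

    vision_to_text = torch.softmax(similarity, dim=-1)                  # each image token attends over the text tokens
    text_to_vision = torch.softmax(similarity.transpose(1, 2), dim=-1)  # each text token attends over the image tokens

    vision_update = torch.bmm(vision_to_text, text_features)            # (batch, n_img, dim)
    text_update = torch.bmm(text_to_vision, vision_features)            # (batch, n_text, dim)

Computing the similarity once and reusing its transpose is what makes the fusion symmetric: image features are refined by the text they match, and text features are refined by the regions that match them.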
+ self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.text_proj = nn.Linear(self.text_dim, self.embed_dim) + self.values_vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.values_text_proj = nn.Linear(self.text_dim, self.embed_dim) + + self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) + self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.vision_proj.weight) + self.vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.text_proj.weight) + self.text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_vision_proj.weight) + self.values_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_text_proj.weight) + self.values_text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_vision_proj.weight) + self.out_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_text_proj.weight) + self.out_text_proj.bias.data.fill_(0) + + def forward( + self, + vision_features: Tensor, + text_features: Tensor, + vision_attention_mask: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None + ): + """_summary_ + + Args: + vision_features Tensor: bs, n_img, dim + text_features Tensor: bs, n_text, dim + vision_attention_mask (Tensor, optional): _description_. bs, n_img + text_attention_mask (Tensor, optional): _description_. bs, n_text + + Returns: + _type_: _description_ + """ + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + bsz, tgt_len, _ = vision_features.size() + + vision_query_states = self.vision_proj(vision_features) * self.scale + vision_query_states = self._shape(vision_query_states, tgt_len, bsz) + + text_key_states = self.text_proj(text_features) + text_key_states = self._shape(text_key_states, -1, bsz) + + vision_value_states = self.values_vision_proj(vision_features) + vision_value_states = self._shape(vision_value_states, -1, bsz) + + text_value_states = self.values_text_proj(text_features) + text_value_states = self._shape(text_value_states, -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + + vision_query_states = vision_query_states.view(*proj_shape) + text_key_states = text_key_states.view(*proj_shape) + vision_value_states = vision_value_states.view(*proj_shape) + text_value_states = text_value_states.view(*proj_shape) + + src_len = text_key_states.size(1) + attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + attn_weights = attn_weights - attn_weights.max() + + attn_weights = torch.clamp( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + attn_weights = torch.clamp( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose(1, 2) + text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, 
keepdim=True)[0] + + text_attn_weights = torch.clamp( + text_attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + text_attn_weights = torch.clamp( + text_attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if vision_attention_mask is not None: + vision_attention_mask = ( + vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + text_attn_weights.masked_fill_(vision_attention_mask, float("-inf")) + + text_attn_weights = text_attn_weights.softmax(dim=-1) + + # mask language for vision + if text_attention_mask is not None: + text_attention_mask = ( + text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights.masked_fill_(text_attention_mask, float("-inf")) + vision_attn_weights = attn_weights.softmax(dim=-1) + + vision_attn_probs = F.dropout(vision_attn_weights, p=self.dropout, training=self.training) + text_attn_probs = F.dropout(text_attn_weights, p=self.dropout, training=self.training) + + vision_attn_output = torch.bmm(vision_attn_probs, text_value_states) + text_attn_output = torch.bmm(text_attn_probs, vision_value_states) + + if vision_attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`vision_attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" + ) + + if text_attn_output.size() != (bsz * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`text_attn_output` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" + ) + + vision_attn_output = vision_attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + vision_attn_output = vision_attn_output.transpose(1, 2) + vision_attn_output = vision_attn_output.reshape(bsz, tgt_len, self.embed_dim) + + text_attn_output = text_attn_output.view(bsz, self.num_heads, src_len, self.head_dim) + text_attn_output = text_attn_output.transpose(1, 2) + text_attn_output = text_attn_output.reshape(bsz, src_len, self.embed_dim) + + vision_attn_output = self.out_vision_proj(vision_attn_output) + text_attn_output = self.out_text_proj(text_attn_output) + + return vision_attn_output, text_attn_output + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO +class GroundingDINODropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + +class GroundingDINOBiAttention(nn.Module): + def __init__( + self, + vision_dim, + text_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super().__init__() + + # pre layer norm + self.layer_norm_vision = nn.LayerNorm(vision_dim) + self.layer_norm_text = nn.LayerNorm(text_dim) + self.attn = BiMultiHeadAttention( + vision_dim=vision_dim, text_dim=text_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.gamma_v = nn.Parameter(init_values * torch.ones((vision_dim)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((text_dim)), requires_grad=True) + + def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): + vision_features = self.layer_norm_vision(vision_features) + text_features = self.layer_norm_text(text_features) + delta_v, delta_l = self.attn( + vision_features, + text_features, + attention_mask_vision=attention_mask_vision, + attention_mask_text=attention_mask_text + ) + # vision_features, text_features = vision_features + delta_v, text_features + delta_l + vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) + text_features = text_features + self.drop_path(self.gamma_l * delta_l) + return vision_features, text_features # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO class GroundingDINOEncoderLayer(nn.Module): @@ -1499,8 +1797,6 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) - # Create Text Extractor - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -1850,7 +2146,6 @@ def forward( """, GROUNDING_DINO_START_DOCSTRING, ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with 
DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] @@ -1866,6 +2161,7 @@ def __init__(self, config: GroundingDINOConfig): self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) @@ -2588,6 +2884,8 @@ def forward( embeddings = self.dropout(embeddings) return embeddings +# Classes for Text Backbone (It's just a BERT model) + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3013,7 +3311,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: pooled_output = self.activation(pooled_output) return pooled_output -# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText class GroundingDINOTextModel(PreTrainedModel): """ @@ -3029,12 +3326,16 @@ class GroundingDINOTextModel(PreTrainedModel): def __init__(self, config, add_pooling_layer=True): super().__init__(config) + self.config = config self.embeddings = GroundingDINOTextEmbeddings(config) self.encoder = GroundingDINOTextEncoder(config) self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + # Initialize weights and apply final processing + self.post_init() + def get_input_embeddings(self): return self.embeddings.word_embeddings From 619e0962f8a2c0522a308852708c8055b154caf3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 31 Aug 2023 20:59:26 -0300 Subject: [PATCH 006/252] Adapted Encoder layer --- .../configuration_grounding_dino.py | 8 + .../grounding_dino/modeling_grounding_dino.py | 180 +++++++++++++----- 2 files changed, 137 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e77d4be247b746..3abf4912ebb651 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -199,6 +199,9 @@ def __init__( #other parameters max_text_len = 256, sub_sentence_present = True, + text_enhancer_dropout = 0.0, + fusion_droppath = 0.1, + fusion_dropout = 0.0, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -261,6 +264,11 @@ def __init__( self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) self.max_text_len = max_text_len self.sub_sentence_present = sub_sentence_present + # Text Enhancer + self.text_enhancer_dropout = text_enhancer_dropout + # Fusion + self.fusion_droppath = fusion_droppath + self.fusion_dropout = fusion_dropout super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 731172570c23d2..91129946c6141e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -855,30 +855,28 @@ def 
forward( return attn_output, attn_weights_reshaped # Repeting some code to avoid convert nn.MultiheadAttention later -class GroundingDINOEncoderTextLayer(nn.Module): - def __init__( - self, - embed_dim, - num_heads, - ffn_dim: int, - dropout: float = 0.0, - bias: bool = True, - activation: str = 'relu' - ): +#TODO is this an approriate way to name this? +class GroundingDINOTextEnhancerLayer(nn.Module): + """Vanilla Transformer with text embeddings as input""" + def __init__(self, config): super().__init__() - self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) + self.self_attn = nn.MultiheadAttention( + embed_dim=config.d_model, + num_heads=config.num_heads // 2, + dropout=config.text_enhancer_dropout + ) # Implementation of Feedforward model - self.fc1 = nn.Linear(embed_dim, ffn_dim) - self.dropout = nn.Dropout(dropout) - self.fc2 = nn.Linear(ffn_dim, embed_dim) + self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) + self.dropout = nn.Dropout(config.text_enhancer_dropout) + self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) - self.layer_norm_before = nn.LayerNorm(embed_dim) - self.layer_norm_after = nn.LayerNorm(embed_dim) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) + self.layer_norm_before = nn.LayerNorm(config.d_model) + self.layer_norm_after = nn.LayerNorm(config.d_model) + self.dropout1 = nn.Dropout(config.text_enhancer_dropout) + self.dropout2 = nn.Dropout(config.text_enhancer_dropout) - self.activation = ACT2FN[activation] - self.num_heads = num_heads + self.activation = ACT2FN[config.activation_fuction] + self.num_heads = config.num_heads // 2 def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -903,8 +901,8 @@ def forward( hidden_states = hidden_states + self.dropout2(attention_output) hidden_states = self.layer_norm_after(hidden_states) return hidden_states - -class BiMultiHeadAttention(nn.Module): + +class GroundingDINOBiMultiHeadAttention(nn.Module): def __init__( self, vision_dim: int, @@ -1106,38 +1104,26 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -class GroundingDINOBiAttention(nn.Module): - def __init__( - self, - vision_dim, - text_dim, - embed_dim, - num_heads, - dropout=0.1, - drop_path=0.0, - init_values=1e-4, - ): - """ - Inputs: - embed_dim - Dimensionality of input and attention feature vectors - hidden_dim - Dimensionality of hidden layer in feed-forward network - (usually 2-4x larger than embed_dim) - num_heads - Number of heads to use in the Multi-Head Attention block - dropout - Amount of dropout to apply in the feed-forward network - """ +class GroundingDINOFusionLayer(nn.Module): + def __init__(self, config, init_values=1e-4): super().__init__() + drop_path = config.fusion_droppath # pre layer norm - self.layer_norm_vision = nn.LayerNorm(vision_dim) - self.layer_norm_text = nn.LayerNorm(text_dim) - self.attn = BiMultiHeadAttention( - vision_dim=vision_dim, text_dim=text_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + self.layer_norm_vision = nn.LayerNorm(config.d_model) + self.layer_norm_text = nn.LayerNorm(config.d_model) + self.attn = GroundingDINOBiMultiHeadAttention( + vision_dim=config.d_model, + text_dim=config.d_model, + embed_dim=config.encoder_ffn_dim // 2, + num_heads=config.num_heads // 2, + dropout=config.fusion_dropout ) # add 
layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.gamma_v = nn.Parameter(init_values * torch.ones((vision_dim)), requires_grad=True) - self.gamma_l = nn.Parameter(init_values * torch.ones((text_dim)), requires_grad=True) + self.gamma_v = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) @@ -1153,8 +1139,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at text_features = text_features + self.drop_path(self.gamma_l * delta_l) return vision_features, text_features -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO -class GroundingDINOEncoderLayer(nn.Module): +#NOTE just renamed the class +class GroundingDINODeformableLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() self.embed_dim = config.d_model @@ -1238,6 +1224,98 @@ def forward( return outputs +def get_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, + ) -> Tensor: + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. 
+ """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res + + +class GroundingDINOEncoderLayer(nn.Module): + def __init__(self, config) -> None: + super().__init_() + self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) + self.fusion_layer = GroundingDINOFusionLayer(config) + self.deformable_layer = GroundingDINODeformableLayer(config) + + def forward( + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None + ): + bs, n_text, text_dim = text_features.shape + if text_position_embedding is None and text_position_ids is None: + pos_text = ( + torch.arange(n_text, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + pos_text = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + vision_features, text_features = self.fusion_layer( + vision_features=vision_features, + text_features=text_features, + attention_mask_vision=key_padding_mask, + attention_mask_text=text_attention_mask, + ) + + text_features = self.text_enhancer_layer( + hidden_states=text_features.transpose(0, 1), + attention_masks=~text_self_attention_masks, # note we use ~ for mask here + position_embeddings=(pos_text.transpose(0, 1) if pos_text is not None else None), + ).transpose(0, 1) + + vision_features = self.deformable_layer( + hidden_states=vision_features, + attention_mask=key_padding_mask, + position_embeddings=vision_position_embedding, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + ) + + return vision_features, text_features + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO class GroundingDINODecoderLayer(nn.Module): @@ -1788,7 +1866,6 @@ def custom_forward(*inputs): """, GROUNDING_DINO_START_DOCSTRING, ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO class GroundingDINOModel(GroundingDINOPreTrainedModel): def __init__(self, config: GroundingDINOConfig): super().__init__(config) @@ -1797,6 +1874,8 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + # Create text backbone + self.text_backbone = 
GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -2161,7 +2240,6 @@ def __init__(self, config: GroundingDINOConfig): self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) From 52fa847a77d42bea476070ec51a3cb125cf73928 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 1 Sep 2023 11:37:07 -0300 Subject: [PATCH 007/252] Fixed typos --- .../grounding_dino/modeling_grounding_dino.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 91129946c6141e..984587d3997d67 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -862,7 +862,7 @@ def __init__(self, config): super().__init__() self.self_attn = nn.MultiheadAttention( embed_dim=config.d_model, - num_heads=config.num_heads // 2, + num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout ) # Implementation of Feedforward model @@ -875,8 +875,8 @@ def __init__(self, config): self.dropout1 = nn.Dropout(config.text_enhancer_dropout) self.dropout2 = nn.Dropout(config.text_enhancer_dropout) - self.activation = ACT2FN[config.activation_fuction] - self.num_heads = config.num_heads // 2 + self.activation = ACT2FN[config.activation_function] + self.num_heads = config.encoder_attention_heads // 2 def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -1116,7 +1116,7 @@ def __init__(self, config, init_values=1e-4): vision_dim=config.d_model, text_dim=config.d_model, embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.num_heads // 2, + num_heads=config.encoder_attention_heads // 2, dropout=config.fusion_dropout ) @@ -1258,25 +1258,25 @@ def sine_func(x: torch.Tensor): class GroundingDINOEncoderLayer(nn.Module): def __init__(self, config) -> None: - super().__init_() + super().__init__() self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) def forward( - self, - vision_features: Tensor, - vision_position_embedding: Tensor, - spatial_shapes: Tensor, - level_start_index: Tensor, - key_padding_mask: Tensor, - reference_points: Tensor, - text_features: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None, - text_position_embedding: Optional[Tensor] = None, - text_self_attention_masks: Optional[Tensor] = None, - text_position_ids: Optional[Tensor] = None - ): + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None + ): bs, n_text, text_dim = text_features.shape if text_position_embedding is None and text_position_ids is None: pos_text = 
( From a527a836a55c8110f07cd5ed412546c5cc52e0db Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 4 Sep 2023 13:08:37 -0300 Subject: [PATCH 008/252] Adjusted Encoder --- .../grounding_dino/modeling_grounding_dino.py | 234 +++++++++++++----- 1 file changed, 176 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 984587d3997d67..229c5d89c716f9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -173,6 +173,55 @@ class GroundingDINODecoderOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None cross_attentions: Optional[Tuple[torch.FloatTensor]] = None +@dataclass +class GroundingDINOEncoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDINOEncoder. This class extends + BaseModelOutput, due to: + - vision and text last hidden states + - vision and text intermediate hidden states + - vision and text attentions + - vision and text cross attentions + + Args: + last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the vision encoder. + last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the text encoder. + hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer + plus the initial embedding outputs. + hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer + plus the initial embedding outputs. + attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + """ + last_hidden_state_vision: torch.FloatTensor = None + last_hidden_state_text: torch.FloatTensor = None + hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + attentions_text: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + @dataclass # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO @@ -892,7 +941,7 @@ def forward( attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) - attention_output = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks)[0] + attention_output, attention_weights = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks) hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) @@ -900,7 +949,7 @@ def forward( attention_output = self.fc2(self.dropout(hidden_states)) hidden_states = hidden_states + self.dropout2(attention_output) hidden_states = self.layer_norm_after(hidden_states) - return hidden_states + return hidden_states, attention_weights class GroundingDINOBiMultiHeadAttention(nn.Module): def __init__( @@ -933,10 +982,6 @@ def __init__( self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) - self.stable_softmax_2d = True - self.clamp_min_for_underflow = True - self.clamp_max_for_overflow = True - self._reset_parameters() def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): @@ -1068,7 +1113,7 @@ def forward( vision_attn_output = self.out_vision_proj(vision_attn_output) text_attn_output = self.out_text_proj(text_attn_output) - return vision_attn_output, text_attn_output + return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) # Copied from transformers.models.beit.modeling_beit.drop_path def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: @@ -1128,16 +1173,16 @@ def __init__(self, config, init_values=1e-4): def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) - delta_v, delta_l = self.attn( + (delta_v, vision_attn), (delta_t, text_attn) = self.attn( vision_features, text_features, attention_mask_vision=attention_mask_vision, attention_mask_text=attention_mask_text ) - # vision_features, text_features = vision_features + delta_v, text_features + delta_l vision_features = vision_features + 
self.drop_path(self.gamma_v * delta_v) - text_features = text_features + self.drop_path(self.gamma_l * delta_l) - return vision_features, text_features + text_features = text_features + self.drop_path(self.gamma_l * delta_t) + + return (vision_features, vision_attn), (text_features, text_attn) #NOTE just renamed the class class GroundingDINODeformableLayer(nn.Module): @@ -1263,6 +1308,29 @@ def __init__(self, config) -> None: self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) + def get_text_position_embeddings( + self, + text_features: Tensor, + text_position_embedding: Tensor, + text_position_ids: Tensor + ) -> Tensor: + bs, n_text, text_dim = text_features.shape + if text_position_embedding is None and text_position_ids is None: + text_position_embedding = ( + torch.arange(n_text, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + return text_position_embedding + def forward( self, vision_features: Tensor, @@ -1277,35 +1345,28 @@ def forward( text_self_attention_masks: Optional[Tensor] = None, text_position_ids: Optional[Tensor] = None ): - bs, n_text, text_dim = text_features.shape - if text_position_embedding is None and text_position_ids is None: - pos_text = ( - torch.arange(n_text, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(bs, 1, 1) - ) - pos_text = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) - if text_position_ids is not None: - text_position_embedding = get_sine_pos_embed( - text_position_ids[..., None], num_pos_feats=256, exchange_xy=False - ) + text_position_embedding = self.get_text_position_embeddings( + text_features, + text_position_embedding, + text_position_ids + ) - vision_features, text_features = self.fusion_layer( + (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( vision_features=vision_features, text_features=text_features, attention_mask_vision=key_padding_mask, attention_mask_text=text_attention_mask, ) - text_features = self.text_enhancer_layer( + (text_features, text_enhanced_attn) = self.text_enhancer_layer( hidden_states=text_features.transpose(0, 1), attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=(pos_text.transpose(0, 1) if pos_text is not None else None), + position_embeddings=( + text_position_embedding.transpose(0, 1) if text_position_embedding is not None else None + ), ).transpose(0, 1) - vision_features = self.deformable_layer( + (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, attention_mask=key_padding_mask, position_embeddings=vision_position_embedding, @@ -1314,7 +1375,10 @@ def forward( level_start_index=level_start_index, ) - return vision_features, text_features + return ( + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn) + ) # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO @@ -1538,7 +1602,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from 
transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->GroundingDINO class GroundingDINOEncoder(GroundingDINOPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a @@ -1592,26 +1655,31 @@ def get_reference_points(spatial_shapes, valid_ratios, device): def forward( self, - inputs_embeds=None, - attention_mask=None, - position_embeddings=None, - spatial_shapes=None, - level_start_index=None, + vision_features: Tensor, + vision_attention_mask: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios=None, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: - 1 for pixel features that are real (i.e. **not masked**), - 0 for pixel features that are padding (i.e. **masked**). [What are attention masks?](../glossary#attention-mask) - position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Position embeddings that are added to the queries and keys in each self-attention layer. spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): Spatial shapes of each feature map. @@ -1619,6 +1687,21 @@ def forward( Starting index of each feature map. valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): Ratio of valid area in each feature level. + text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Flattened text features that are passed to the encoder. + text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + text_self_attention_masks (`torch.BoolTensor` of shape `(batch_size, text_seq_len, text_seq_len)`): + Masks to avoid performing attention between padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + text_position_ids (`torch.LongTensor` of shape `(batch_size, num_queries)`): + Position ids for text features. 
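# For reference, a minimal sketch of the sinusoidal embedding applied to the text
# position ids documented above (a simplified stand-in for the `get_sine_pos_embed`
# helper referenced earlier in this file; the temperature and interleaving used
# upstream are assumptions, only the overall shape flow is illustrated).
import math
import torch

def sine_pos_embed(pos: torch.Tensor, num_pos_feats: int = 256, temperature: int = 10000) -> torch.Tensor:
    # pos: (batch_size, seq_len, 1) float positions -> (batch_size, seq_len, num_pos_feats)
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
    pos = pos * 2 * math.pi
    pos = pos[..., None] / dim_t  # (bs, seq_len, 1, num_pos_feats)
    pos = torch.stack((pos[..., 0::2].sin(), pos[..., 1::2].cos()), dim=-1).flatten(-2)
    return pos.squeeze(2)

# usage: default positions 0..seq_len-1 when no explicit text_position_ids are given
batch_size, seq_len = 2, 6
position_ids = torch.arange(seq_len).float().unsqueeze(0).unsqueeze(-1).repeat(batch_size, 1, 1)
text_position_embedding = sine_pos_embed(position_ids)  # (2, 6, 256)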
output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1634,41 +1717,76 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = inputs_embeds - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + #TODO check if this is necessary according to original implementation + vision_features = nn.functional.dropout(vision_features, p=self.dropout, training=self.training) - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None + encoder_vision_states = () if output_hidden_states else None + encoder_text_states = () if output_hidden_states else None + all_attn_fused_text = () if output_attentions else None + all_attn_fused_vision = () if output_attentions else None + all_attn_enhanced_text = () if output_attentions else None + all_attn_deformable = () if output_attentions else None for i, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + # INPUTS FOR ENCODER LAYER + # - vision_features: Tensor, + # - vision_position_embedding: Tensor, + # - spatial_shapes: Tensor, + # - level_start_index: Tensor, + # - key_padding_mask: Tensor, + # - reference_points: Tensor, + # - text_features: Optional[Tensor] = None, + # - text_attention_mask: Optional[Tensor] = None, + # - text_position_embedding: Optional[Tensor] = None, + # - text_self_attention_masks: Optional[Tensor] = None, + # - text_position_ids: Optional[Tensor] = None + (vision_features, text_features), attentions = encoder_layer( + vision_features=vision_features, + vision_position_embedding=vision_position_embedding, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - output_attentions=output_attentions, + key_padding_mask=vision_attention_mask, + reference_points=reference_points, + text_features=text_features, + text_attention_mask=text_attention_mask, + text_position_embedding=text_position_embedding, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=text_position_ids ) - hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) + all_attn_fused_vision += (attentions[0],) + all_attn_fused_text += (attentions[1],) + all_attn_enhanced_text += (attentions[2],) + all_attn_deformable += (attentions[3],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + enc_outputs = [ + vision_features, text_features, + all_attn_fused_vision, all_attn_fused_text, + all_attn_enhanced_text, all_attn_deformable + ] + return tuple(v for v in enc_outputs 
if v is not None) + return GroundingDINOEncoderOutput( + last_hidden_state_vision=vision_features, + last_hidden_state_text=text_features, + hidden_states_vision=encoder_vision_states, + hidden_states_text=encoder_text_states, + cross_attentions_vision=all_attn_fused_vision, + cross_attentions_text=all_attn_fused_text, + attentions_vision=all_attn_deformable, + attentions_text=all_attn_enhanced_text ) - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ From 791943c244879fd054f161932e598f24045fb8eb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 4 Sep 2023 13:09:56 -0300 Subject: [PATCH 009/252] Converted encoder to hf --- .../configuration_grounding_dino.py | 2 +- .../convert_grounding_dino_to_hf.py | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3abf4912ebb651..14e82704cb495b 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -157,7 +157,7 @@ def __init__( backbone_config={"model_type": "swin"}, text_backbone_config="bert-base-uncased", num_channels=3, - num_queries=300, + num_queries=900, max_position_embeddings=1024, encoder_layers=6, encoder_ffn_dim=2048, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d5ebc9281b8733..f9fc7e87d12bba 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -131,6 +131,88 @@ def create_rename_keys(state_dict, config): if "module.bert" in layer_name: rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) ########################################## TEXT BACKBONE - END + + ########################################## ENCODER - START + deformable_key_mappings = { + 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', + 'self_attn.sampling_offsets.bias': 'deformable_layer.self_attn.sampling_offsets.bias', + 'self_attn.attention_weights.weight': 'deformable_layer.self_attn.attention_weights.weight', + 'self_attn.attention_weights.bias': 'deformable_layer.self_attn.attention_weights.bias', + 'self_attn.value_proj.weight': 'deformable_layer.self_attn.value_proj.weight', + 'self_attn.value_proj.bias': 'deformable_layer.self_attn.value_proj.bias', + 'self_attn.output_proj.weight': 'deformable_layer.self_attn.output_proj.weight', + 'self_attn.output_proj.bias': 'deformable_layer.self_attn.output_proj.bias', + 'norm1.weight': 'deformable_layer.self_attn_layer_norm.weight', + 'norm1.bias': 'deformable_layer.self_attn_layer_norm.bias', + 'linear1.weight': 'deformable_layer.fc1.weight', + 'linear1.bias': 'deformable_layer.fc1.bias', + 'linear2.weight': 'deformable_layer.fc2.weight', + 'linear2.bias': 'deformable_layer.fc2.bias', + 'norm2.weight': 'deformable_layer.final_layer_norm.weight', + 'norm2.bias': 'deformable_layer.final_layer_norm.bias', + } + text_enhancer_key_mappings = { + 'self_attn.in_proj_weight': 'text_enhancer_layer.self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 
'text_enhancer_layer.self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'text_enhancer_layer.self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'text_enhancer_layer.self_attn.out_proj.bias', + 'linear1.weight': 'text_enhancer_layer.fc1.weight', + 'linear1.bias': 'text_enhancer_layer.fc1.bias', + 'linear2.weight': 'text_enhancer_layer.fc2.weight', + 'linear2.bias': 'text_enhancer_layer.fc2.bias', + 'norm1.weight': 'text_enhancer_layer.layer_norm_before.weight', + 'norm1.bias': 'text_enhancer_layer.layer_norm_before.bias', + 'norm2.weight': 'text_enhancer_layer.layer_norm_after.weight', + 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', + } + fusion_key_mappings = { + 'gamma_v': 'fusion_layer.gamma_v', + 'gamma_l': 'fusion_layer.gamma_l', + 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', + 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', + 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', + 'layer_norm_l.bias': 'fusion_layer.layer_norm_text.bias', + 'attn.v_proj.weight': 'fusion_layer.attn.vision_proj.weight', + 'attn.v_proj.bias': 'fusion_layer.attn.vision_proj.bias', + 'attn.l_proj.weight': 'fusion_layer.attn.text_proj.weight', + 'attn.l_proj.bias': 'fusion_layer.attn.text_proj.bias', + 'attn.values_v_proj.weight': 'fusion_layer.attn.values_vision_proj.weight', + 'attn.values_v_proj.bias': 'fusion_layer.attn.values_vision_proj.bias', + 'attn.values_l_proj.weight': 'fusion_layer.attn.values_text_proj.weight', + 'attn.values_l_proj.bias': 'fusion_layer.attn.values_text_proj.bias', + 'attn.out_v_proj.weight': 'fusion_layer.attn.out_vision_proj.weight', + 'attn.out_v_proj.bias': 'fusion_layer.attn.out_vision_proj.bias', + 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', + 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', + } + + for layer in range(config.encoder_layers): + # deformable + for src, dest in deformable_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # text enhance + for src, dest in text_enhancer_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.text_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # fusion layers + for src, dest in fusion_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.fusion_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + ########################################## ENCODER - END + + #TODO convert decoder + ########################################## DECODER - START + ########################################## DECODER - END + + #TODO convert head + ########################################## HEAD - START + ########################################## HEAD - END + + #TODO convert additional layers + ########################################## Additional - START + ########################################## Additional - END + # fmt: on return rename_keys @@ -259,6 +341,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) + for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) From 9c57788ee857fc77415b285120e71ba386ac0ba0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 5 Sep 2023 16:10:51 -0300 Subject: [PATCH 010/252] Modified Decoder Layer --- .../grounding_dino/modeling_grounding_dino.py | 51 
++++++++++++++----- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 229c5d89c716f9..9f6edac849f2c9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1388,7 +1388,7 @@ def __init__(self, config: GroundingDINOConfig): self.embed_dim = config.d_model # self-attention - self.self_attn = GroundingDINOMultiheadAttention( + self.self_attn = nn.MultiheadAttention( embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, @@ -1398,6 +1398,13 @@ def __init__(self, config: GroundingDINOConfig): self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention text + self.encoder_attn_text = nn.MultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( config, @@ -1410,6 +1417,9 @@ def __init__(self, config: GroundingDINOConfig): self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) self.final_layer_norm = nn.LayerNorm(self.embed_dim) + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + def forward( self, hidden_states: torch.Tensor, @@ -1417,8 +1427,11 @@ def forward( reference_points=None, spatial_shapes=None, level_start_index=None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, + vision_encoder_hidden_states: Optional[torch.Tensor] = None, + vision_encoder_attention_mask: Optional[torch.Tensor] = None, + text_encoder_hidden_states: Optional[torch.Tensor] = None, + text_encoder_attention_mask: Optional[torch.Tensor] = None, + self_attn_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): """ @@ -1446,9 +1459,10 @@ def forward( # Self Attention hidden_states, self_attn_weights = self.self_attn( - hidden_states=hidden_states, - position_embeddings=position_embeddings, - output_attentions=output_attentions, + query=self.with_pos_embed(hidden_states, position_embeddings), + key=self.with_pos_embed(hidden_states, position_embeddings), + value=hidden_states, + attn_mask=self_attn_mask ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1457,13 +1471,27 @@ def forward( second_residual = hidden_states + # Cross-Attention Text + hidden_states, text_cross_attn_weights = self.encoder_attn_text( + query=self.with_pos_embed(hidden_states, position_embeddings), + key=text_encoder_hidden_states.transpose(0, 1), + value=text_encoder_hidden_states.transpose(0, 1), + attn_mask=text_encoder_attention_mask, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + hidden_states = self.encoder_attn_text_layer_norm(hidden_states) + + third_residual = hidden_states + # Cross-Attention cross_attn_weights = None hidden_states, cross_attn_weights = self.encoder_attn( hidden_states=hidden_states, - attention_mask=encoder_attention_mask, - encoder_hidden_states=encoder_hidden_states, - 
encoder_attention_mask=encoder_attention_mask, + attention_mask=vision_encoder_attention_mask, + encoder_hidden_states=vision_encoder_hidden_states, + encoder_attention_mask=vision_encoder_attention_mask, position_embeddings=position_embeddings, reference_points=reference_points, spatial_shapes=spatial_shapes, @@ -1472,8 +1500,7 @@ def forward( ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = second_residual + hidden_states - + hidden_states = third_residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) # Fully Connected @@ -1488,7 +1515,7 @@ def forward( outputs = (hidden_states,) if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) + outputs += (self_attn_weights, text_cross_attn_weights, cross_attn_weights) return outputs From 962ce238a3fefb2f1f9735bb2843965a666f9f1b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 6 Sep 2023 14:33:57 -0300 Subject: [PATCH 011/252] Modified main decoder class --- .../configuration_grounding_dino.py | 6 +-- .../convert_grounding_dino_to_hf.py | 37 ++++++++++++++ .../grounding_dino/modeling_grounding_dino.py | 49 +++++++++++++------ 3 files changed, 73 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 14e82704cb495b..33de7c666cef19 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -163,7 +163,7 @@ def __init__( encoder_ffn_dim=2048, encoder_attention_heads=8, decoder_layers=6, - decoder_ffn_dim=1024, + decoder_ffn_dim=2048, decoder_attention_heads=8, encoder_layerdrop=0.0, is_encoder_decoder=True, @@ -183,9 +183,9 @@ def __init__( num_feature_levels=4, encoder_n_points=4, decoder_n_points=4, - two_stage=False, + two_stage=True, two_stage_num_proposals=300, - with_box_refine=False, + with_box_refine=True, class_cost=1, bbox_cost=5, giou_cost=2, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index f9fc7e87d12bba..846892980d2d21 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -203,6 +203,43 @@ def create_rename_keys(state_dict, config): #TODO convert decoder ########################################## DECODER - START + key_mappings_decoder = { + 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', + 'cross_attn.sampling_offsets.bias': 'encoder_attn.sampling_offsets.bias', + 'cross_attn.attention_weights.weight': 'encoder_attn.attention_weights.weight', + 'cross_attn.attention_weights.bias': 'encoder_attn.attention_weights.bias', + 'cross_attn.value_proj.weight': 'encoder_attn.value_proj.weight', + 'cross_attn.value_proj.bias': 'encoder_attn.value_proj.bias', + 'cross_attn.output_proj.weight': 'encoder_attn.output_proj.weight', + 'cross_attn.output_proj.bias': 'encoder_attn.output_proj.bias', + 'norm1.weight': 'encoder_attn_layer_norm.weight', + 'norm1.bias': 'encoder_attn_layer_norm.bias', + 'ca_text.in_proj_weight': 'encoder_attn_text.in_proj_weight', + 'ca_text.in_proj_bias': 'encoder_attn_text.in_proj_bias', + 'ca_text.out_proj.weight': 'encoder_attn_text.out_proj.weight', + 'ca_text.out_proj.bias': 'encoder_attn_text.out_proj.bias', + 
'catext_norm.weight': 'encoder_attn_text_layer_norm.weight', + 'catext_norm.bias': 'encoder_attn_text_layer_norm.bias', + 'self_attn.in_proj_weight': 'self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 'self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'self_attn.out_proj.bias', + 'norm2.weight': 'self_attn_layer_norm.weight', + 'norm2.bias': 'self_attn_layer_norm.bias', + 'linear1.weight': 'fc1.weight', + 'linear1.bias': 'fc1.bias', + 'linear2.weight': 'fc2.weight', + 'linear2.bias': 'fc2.bias', + 'norm3.weight': 'final_layer_norm.weight', + 'norm3.bias': 'final_layer_norm.bias', + } + for layer_num in range(config.decoder_layers): + source_prefix_decoder = f'module.transformer.decoder.layers.{layer_num}.' + target_prefix_decoder = f'model.decoder.layers.{layer_num}.' + + for source_name, target_name in key_mappings_decoder.items(): + rename_keys.append((source_prefix_decoder + source_name, + target_prefix_decoder + target_name)) ########################################## DECODER - END #TODO convert head diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9f6edac849f2c9..d57e823199703a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -160,10 +160,14 @@ class GroundingDINODecoderOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + vision_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. + text_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the text cross-attention heads. 
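# Illustrative only: the ordering of attention blocks inside the rewritten decoder
# layer (self-attention, then cross-attention over text, then cross-attention over
# image features), which is why the decoder output now carries both text and vision
# cross-attention tuples. A plain nn.MultiheadAttention stands in for the multiscale
# deformable attention, so class and argument names here are assumptions.
import torch
from torch import nn

class ToyBiModalDecoderLayer(nn.Module):
    def __init__(self, d_model: int = 256, n_heads: int = 8):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.text_cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.image_cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norms = nn.ModuleList([nn.LayerNorm(d_model) for _ in range(3)])

    def forward(self, queries, text_feats, image_feats):
        attn_out, self_weights = self.self_attn(queries, queries, queries)
        queries = self.norms[0](queries + attn_out)
        attn_out, text_weights = self.text_cross_attn(queries, text_feats, text_feats)
        queries = self.norms[1](queries + attn_out)
        attn_out, image_weights = self.image_cross_attn(queries, image_feats, image_feats)
        queries = self.norms[2](queries + attn_out)
        # text_weights / image_weights correspond to the text_cross_attentions and
        # vision_cross_attentions entries collected per layer
        return queries, self_weights, text_weights, image_weights

layer = ToyBiModalDecoderLayer()
out, *_ = layer(torch.randn(1, 900, 256), torch.randn(1, 16, 256), torch.randn(1, 1000, 256))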
""" last_hidden_state: torch.FloatTensor = None @@ -171,7 +175,8 @@ class GroundingDINODecoderOutput(ModelOutput): intermediate_reference_points: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None @dataclass class GroundingDINOEncoderOutput(ModelOutput): @@ -1814,7 +1819,6 @@ def forward( attentions_text=all_attn_enhanced_text ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. @@ -1840,20 +1844,24 @@ def __init__(self, config: GroundingDINOConfig): # hack implementation for iterative bounding box refinement and two-stage Deformable DETR self.bbox_embed = None self.class_embed = None + self.query_scale = None # Initialize weights and apply final processing self.post_init() def forward( self, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, + inputs_embeds, + vision_encoder_hidden_states, + vision_encoder_attention_mask=None, + text_encoder_hidden_states=None, + text_encoder_attention_mask=None, position_embeddings=None, reference_points=None, spatial_shapes=None, level_start_index=None, valid_ratios=None, + self_attn_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -1902,7 +1910,8 @@ def forward( # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None + all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None intermediate = () intermediate_reference_points = () @@ -1930,20 +1939,23 @@ def custom_forward(*inputs): layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, - encoder_hidden_states, - encoder_attention_mask, + vision_encoder_hidden_states, + vision_encoder_attention_mask, None, ) else: layer_outputs = decoder_layer( - hidden_states, + hidden_states=hidden_states, position_embeddings=position_embeddings, - encoder_hidden_states=encoder_hidden_states, reference_points=reference_points_input, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, + vision_encoder_hidden_states=vision_encoder_hidden_states, + vision_encoder_attention_mask=vision_encoder_attention_mask, + text_encoder_hidden_states=text_encoder_hidden_states, + text_encoder_attention_mask=text_encoder_attention_mask, + self_attn_mask=self_attn_mask, + output_attentions=output_attentions ) hidden_states = layer_outputs[0] @@ -1970,8 +1982,12 @@ def custom_forward(*inputs): if output_attentions: all_self_attns += (layer_outputs[1],) - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) + if text_encoder_hidden_states is not None: + all_cross_attns_text += (layer_outputs[2],) + + if 
vision_encoder_hidden_states is not None: + all_cross_attns_vision += (layer_outputs[3],) + # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) @@ -2000,7 +2016,8 @@ def custom_forward(*inputs): intermediate_reference_points=intermediate_reference_points, hidden_states=all_hidden_states, attentions=all_self_attns, - cross_attentions=all_cross_attentions, + vision_cross_attentions=all_cross_attns_vision, + text_cross_attentions=all_cross_attns_text ) From 9aedd7f0445db756e18217cc2ad7f6e67140f4c1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 6 Sep 2023 14:38:56 -0300 Subject: [PATCH 012/252] Removed copy comments --- .../models/grounding_dino/modeling_grounding_dino.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d57e823199703a..8cd584c1fcc71c 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -137,7 +137,6 @@ def backward(context, grad_output): @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->GroundingDINO class GroundingDINODecoderOutput(ModelOutput): """ Base class for outputs of the GroundingDINODecoder. This class adds two attributes to @@ -1153,7 +1152,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) - class GroundingDINOFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() @@ -1386,7 +1384,6 @@ def forward( ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO class GroundingDINODecoderLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() @@ -2006,7 +2003,8 @@ def custom_forward(*inputs): intermediate_reference_points, all_hidden_states, all_self_attns, - all_cross_attentions, + all_cross_attns_vision, + all_cross_attns_text ] if v is not None ) From 65fb442af03ca42ea9254757125648cb2fa9a446 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 10 Sep 2023 23:21:17 -0300 Subject: [PATCH 013/252] Fixed forward from GroundingDINOModel and GroundingDINODecoder --- .../configuration_grounding_dino.py | 14 ++ .../convert_grounding_dino_to_hf.py | 9 + .../grounding_dino/modeling_grounding_dino.py | 190 +++++++++++++----- 3 files changed, 162 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 33de7c666cef19..bc43655df050ee 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -130,6 +130,18 @@ class GroundingDINOConfig(PretrainedConfig): disable_custom_kernels (`bool`, *optional*, defaults to `False`): Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom kernels are not supported by PyTorch ONNX export. + max_text_len (`int`, *optional*, defaults to 256): + The maximum length of the text input. + sub_sentence_present (`bool`, *optional*, defaults to `True`): + Whether to use sub-sentence present in the text input. 
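# A rough sketch of the "sub-sentence" idea behind `sub_sentence_present`: tokens
# from different phrases in the prompt only attend to tokens of their own phrase,
# giving a block-diagonal text self-attention mask split at separator tokens.
# Function name and the default separator ids (bert-base-uncased [CLS]/[SEP]/".")
# are assumptions for illustration, not the exact helper used by the model.
import torch

def sub_sentence_attention_mask(input_ids: torch.Tensor, separator_ids=(101, 102, 1012)) -> torch.Tensor:
    # input_ids: (batch_size, seq_len) -> bool mask of shape (batch_size, seq_len, seq_len)
    batch_size, seq_len = input_ids.shape
    special = torch.zeros_like(input_ids, dtype=torch.bool)
    for sep in separator_ids:
        special |= input_ids == sep
    mask = torch.eye(seq_len, dtype=torch.bool, device=input_ids.device).unsqueeze(0).repeat(batch_size, 1, 1)
    for b in range(batch_size):
        start = 0
        for pos in range(seq_len):
            if special[b, pos]:
                # allow attention within the current phrase block
                mask[b, start:pos + 1, start:pos + 1] = True
                start = pos + 1
    return mask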
+ text_enhancer_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the text enhancer. + fusion_droppath (`float`, *optional*, defaults to 0.1): + The droppath ratio for the fusion module. + fusion_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the fusion module. + embedding_init_target (`bool`, *optional*, defaults to `True`): + Whether to initialize the target with Embedding weights. Examples: @@ -202,6 +214,7 @@ def __init__( text_enhancer_dropout = 0.0, fusion_droppath = 0.1, fusion_dropout = 0.0, + embedding_init_target = True, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -269,6 +282,7 @@ def __init__( # Fusion self.fusion_droppath = fusion_droppath self.fusion_dropout = fusion_dropout + self.embedding_init_target = embedding_init_target super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 846892980d2d21..efced9cba0d522 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -248,6 +248,15 @@ def create_rename_keys(state_dict, config): #TODO convert additional layers ########################################## Additional - START + for layer_name, params in state_dict.items(): + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE + if "module.input_proj" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.input_proj", "model.input_proj_vision"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE + if "module.feat_map" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.feat_map", "model.input_proj_text"))) + #### + ########################################## Additional - END # fmt: on diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8cd584c1fcc71c..35ed14fa6859bc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1521,6 +1521,27 @@ def forward( return outputs +class GroundingDINOContrastiveEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.max_text_len = config.max_text_len + + def forward( + self, + vision_hidden_state: torch.FloatTensor, + text_hiddend_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor + ) -> torch.FloatTensor: + + + output = vision_hidden_state @ text_hiddend_state.transpose(-1, -2) + output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_output = torch.full((*output.shape[:-1], self.max_text_len), float("-inf"), device=output.device) + new_output[..., : output.shape[-1]] = output + + return new_output # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead class GroundingDINOClassificationHead(nn.Module): @@ -1836,6 +1857,12 @@ def __init__(self, config: GroundingDINOConfig): self.dropout = config.dropout self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) + self.reference_points_head = GroundingDINOMLPPredictionHead( + config.query_dim // 2 * config.d_model, + config.d_model, + config.d_model, + 2 + ) self.gradient_checkpointing = False # hack implementation for iterative bounding 
box refinement and two-stage Deformable DETR @@ -1846,6 +1873,45 @@ def __init__(self, config: GroundingDINOConfig): # Initialize weights and apply final processing self.post_init() + def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTensor: + """Get the position embedding of the proposals.""" + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries + pos_x = proposals[:, :, 0] * scale + pos_y = proposals[:, :, 1] * scale + # batch_size, num_queries, num_pos_feats + pos_x = pos_x[:, :, None] / dim_t + pos_y = pos_y[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + + if proposals.size(-1) == 2: + # batch_size, num_queries, num_pos_feats * 2 + pos = torch.cat((pos_y, pos_x), dim=2) + elif proposals.size(-1) == 4: + w_embed = proposals[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = proposals[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + # batch_size, num_queries, num_pos_feats * 4 + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError("Unknown proposals shape(-1):{}".format(proposals.size(-1))) + return pos + + + def forward( self, inputs_embeds, @@ -1853,7 +1919,6 @@ def forward( vision_encoder_attention_mask=None, text_encoder_hidden_states=None, text_encoder_attention_mask=None, - position_embeddings=None, reference_points=None, spatial_shapes=None, level_start_index=None, @@ -1875,8 +1940,6 @@ def forward( in `[0, 1]`: - 1 for pixels that are real (i.e. **not masked**), - 0 for pixels that are padding (i.e. **masked**). - position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): - Position embeddings that are added to the queries and keys in each self-attention layer. reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. 
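# Rough sketch of how the 4-d reference boxes documented above are turned into
# per-query position embeddings: each (cx, cy, w, h) coordinate gets sine/cosine
# features, and the concatenation goes through a small MLP (the role of
# `reference_points_head`). Standalone toy version; coordinate ordering and exact
# dimensions are assumptions.
import math
import torch
from torch import nn

def box_sine_embed(boxes: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000) -> torch.Tensor:
    # boxes: (bs, num_queries, 4) in [0, 1] -> (bs, num_queries, 4 * num_pos_feats)
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=boxes.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
    embeds = []
    for i in range(boxes.shape[-1]):
        coord = boxes[:, :, i] * 2 * math.pi
        coord = coord[:, :, None] / dim_t
        embeds.append(torch.stack((coord[:, :, 0::2].sin(), coord[:, :, 1::2].cos()), dim=3).flatten(2))
    return torch.cat(embeds, dim=2)

d_model = 256
reference_points_head = nn.Sequential(nn.Linear(2 * d_model, d_model), nn.ReLU(), nn.Linear(d_model, d_model))
reference_boxes = torch.rand(2, 900, 4)  # e.g. sigmoid-ed top-k proposals
query_pos = reference_points_head(box_sine_embed(reference_boxes))  # (2, 900, 256)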
spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): @@ -1921,6 +1984,8 @@ def forward( if reference_points.shape[-1] != 2: raise ValueError("Reference points' last dimension must be of size 2") reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + query_pos = self.get_proposal_pos_embed(reference_points_input[:, :, 0, :]) + query_pos = self.reference_points_head(query_pos) if output_hidden_states: all_hidden_states += (hidden_states,) @@ -1943,7 +2008,7 @@ def custom_forward(*inputs): else: layer_outputs = decoder_layer( hidden_states=hidden_states, - position_embeddings=position_embeddings, + position_embeddings=query_pos, reference_points=reference_points_input, spatial_shapes=spatial_shapes, level_start_index=level_start_index, @@ -2034,8 +2099,6 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) - # Create text backbone - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -2057,9 +2120,9 @@ def __init__(self, config: GroundingDINOConfig): ) ) in_channels = config.d_model - self.input_proj = nn.ModuleList(input_proj_list) + self.input_proj_vision = nn.ModuleList(input_proj_list) else: - self.input_proj = nn.ModuleList( + self.input_proj_vision = nn.ModuleList( [ nn.Sequential( nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), @@ -2068,8 +2131,12 @@ def __init__(self, config: GroundingDINOConfig): ] ) - if not config.two_stage: - self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + # Create text backbone + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) + self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) + + if config.embedding_init_target or not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) self.encoder = GroundingDINOEncoder(config) self.decoder = GroundingDINODecoder(config) @@ -2079,10 +2146,8 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) - self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) - self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) else: - self.reference_points = nn.Linear(config.d_model, 2) + self.reference_points = nn.Embedding(config.num_queries, 4) self.post_init() @@ -2164,6 +2229,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) proposals.append(proposal) _cur += height * width + output_proposals = torch.cat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid @@ -2181,12 +2247,15 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values, - pixel_mask=None, - decoder_attention_mask=None, + pixel_values: Tensor, + input_ids: Tensor, + attention_mask: Tensor, + token_type_ids: Tensor, + text_token_mask: Tensor, + 
text_self_attention_masks: Tensor, + position_ids: Tensor, + pixel_mask: Optional[Tensor]=None, encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -2221,6 +2290,10 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Extract text features from text backbone + text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)["last_hidden_state"] + text_features = self.input_proj_text(text_features) + batch_size, num_channels, height, width = pixel_values.shape device = pixel_values.device @@ -2230,13 +2303,13 @@ def forward( # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) # First, sent pixel_values + pixel_mask through Backbone to obtain the features # which is a list of tuples - features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) sources = [] masks = [] - for level, (source, mask) in enumerate(features): - sources.append(self.input_proj[level](source)) + for level, (source, mask) in enumerate(vision_features): + sources.append(self.input_proj_vision[level](source)) masks.append(mask) if mask is None: raise ValueError("No attention mask was provided") @@ -2246,9 +2319,9 @@ def forward( _len_sources = len(sources) for level in range(_len_sources, self.config.num_feature_levels): if level == _len_sources: - source = self.input_proj[level](features[-1][0]) + source = self.input_proj_vision[level](vision_features[-1][0]) else: - source = self.input_proj[level](sources[-1]) + source = self.input_proj_vision[level](sources[-1]) mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) sources.append(source) @@ -2257,7 +2330,7 @@ def forward( # Create queries query_embeds = None - if not self.config.two_stage: + if self.config.embedding_init_target or self.config.two_stage: query_embeds = self.query_position_embeddings.weight # Prepare encoder inputs (by flattening) @@ -2288,26 +2361,35 @@ def forward( # Also provide spatial_shapes, level_start_index and valid_ratios if encoder_outputs is None: encoder_outputs = self.encoder( - inputs_embeds=source_flatten, - attention_mask=mask_flatten, - position_embeddings=lvl_pos_embed_flatten, + vision_features=source_flatten, + vision_attention_mask=mask_flatten, + vision_position_embedding=lvl_pos_embed_flatten, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, + text_features=text_features, + text_attention_mask=text_token_mask, + text_position_embedding=None, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=return_dict ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + # If the 
user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): + encoder_outputs = GroundingDINOEncoderOutput( + last_hidden_state_vision=encoder_outputs[0], + last_hidden_state_text=encoder_outputs[1], + hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, + attentions_vision=encoder_outputs[4] if len(encoder_outputs) > 4 else None, + attentions_text=encoder_outputs[5] if len(encoder_outputs) > 5 else None, + cross_attentions_vision=encoder_outputs[6] if len(encoder_outputs) > 6 else None, + cross_attentions_text=encoder_outputs[7] if len(encoder_outputs) > 7 else None, ) # Fifth, prepare decoder inputs - batch_size, _, num_channels = encoder_outputs[0].shape enc_outputs_class = None enc_outputs_coord_logits = None if self.config.two_stage: @@ -2318,14 +2400,19 @@ def forward( # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) - enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + enc_outputs_class = self.decoder.class_embed[-1]( + object_query_embedding, + encoder_outputs[1], + text_token_mask + ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals # only keep top scoring `config.two_stage_num_proposals` proposals topk = self.config.two_stage_num_proposals - topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_logits = enc_outputs_class.max(-1)[0] + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] topk_coords_logits = torch.gather( enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) ) @@ -2333,27 +2420,31 @@ def forward( topk_coords_logits = topk_coords_logits.detach() reference_points = topk_coords_logits.sigmoid() init_reference_points = reference_points - pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) - query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + if query_embeds: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + else: + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ).detach() else: - query_embed, target = torch.split(query_embeds, num_channels, dim=1) - query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) - target = target.unsqueeze(0).expand(batch_size, -1, -1) - reference_points = self.reference_points(query_embed).sigmoid() + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() init_reference_points = reference_points decoder_outputs = self.decoder( inputs_embeds=target, - position_embeddings=query_embed, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=mask_flatten, + vision_encoder_hidden_states=encoder_outputs[0], + vision_encoder_attention_mask=mask_flatten, + text_encoder_hidden_states=encoder_outputs[1], + text_encoder_attention_mask=text_token_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, + 
self_attn_mask=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=return_dict ) if not return_dict: @@ -2396,14 +2487,11 @@ def __init__(self, config: GroundingDINOConfig): self.model = GroundingDINOModel(config) # Detection heads on top - self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.class_embed = GroundingDINOContrastiveEmbedding(config) self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - prior_prob = 0.01 - bias_value = -math.log((1 - prior_prob) / prior_prob) - self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) From fd6ba8768c694b815df31b2293bff7516847763d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 11 Sep 2023 23:40:10 -0300 Subject: [PATCH 014/252] Added all necessary layers, configurations and forward logic up to GroundingDINOModel --- .../configuration_grounding_dino.py | 19 +++++++ .../grounding_dino/modeling_grounding_dino.py | 52 +++++++++++-------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index bc43655df050ee..e413d43b55cd89 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -142,6 +142,14 @@ class GroundingDINOConfig(PretrainedConfig): The dropout ratio for the fusion module. embedding_init_target (`bool`, *optional*, defaults to `True`): Whether to initialize the target with Embedding weights. + query_dim (`int`, *optional*, defaults to 4): + The dimension of the query vector. + decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): + Whether to share the bbox embedding between the decoder and the two-stage bbox generator. + two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. + two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. 
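# Minimal illustration of what the *_share flags above amount to: the shared path
# reuses one prediction-head instance across decoder layers (tied parameters),
# while the non-shared path deep-copies it so every layer gets independent weights.
# The toy head below is an assumption standing in for the bbox MLP.
import copy
from torch import nn

bbox_head = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 4))
decoder_layers = 6

shared = nn.ModuleList([bbox_head for _ in range(decoder_layers)])               # tied weights
independent = nn.ModuleList([copy.deepcopy(bbox_head) for _ in range(decoder_layers)])

assert shared[0] is shared[-1]
assert independent[0] is not independent[-1]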
Examples: @@ -215,6 +223,10 @@ def __init__( fusion_droppath = 0.1, fusion_dropout = 0.0, embedding_init_target = True, + query_dim = 4, + decoder_bbox_embed_share = True, + two_stage_bbox_embed_share = False, + two_stage_class_embed_share = False, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -282,7 +294,14 @@ def __init__( # Fusion self.fusion_droppath = fusion_droppath self.fusion_dropout = fusion_dropout + # Others self.embedding_init_target = embedding_init_target + self.query_dim = query_dim + self.decoder_bbox_embed_share = decoder_bbox_embed_share + self.two_stage_bbox_embed_share = two_stage_bbox_embed_share + if two_stage_bbox_embed_share and not decoder_bbox_embed_share: + raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") + self.two_stage_class_embed_share = two_stage_class_embed_share super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 35ed14fa6859bc..4c35a8cf4b7814 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1856,6 +1856,7 @@ def __init__(self, config: GroundingDINOConfig): super().__init__(config) self.dropout = config.dropout + self.layer_norm = nn.LayerNorm(config.d_model) self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.reference_points_head = GroundingDINOMLPPredictionHead( config.query_dim // 2 * config.d_model, @@ -2038,7 +2039,7 @@ def custom_forward(*inputs): new_reference_points = new_reference_points.sigmoid() reference_points = new_reference_points.detach() - intermediate += (hidden_states,) + intermediate += (self.layer_norm(hidden_states),) intermediate_reference_points += (reference_points,) if output_attentions: @@ -2146,6 +2147,8 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) + self.encoder_output_bbox_embed = None + self.encoder_output_class_embed = None else: self.reference_points = nn.Embedding(config.num_queries, 4) @@ -2400,13 +2403,13 @@ def forward( # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. 
foreground and background) - enc_outputs_class = self.decoder.class_embed[-1]( + enc_outputs_class = self.encoder_output_class_embed( object_query_embedding, encoder_outputs[1], text_token_mask ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) - delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals # only keep top scoring `config.two_stage_num_proposals` proposals @@ -2487,32 +2490,35 @@ def __init__(self, config: GroundingDINOConfig): self.model = GroundingDINOModel(config) # Detection heads on top - self.class_embed = GroundingDINOContrastiveEmbedding(config) - self.bbox_embed = GroundingDINOMLPPredictionHead( + _class_embed = GroundingDINOContrastiveEmbedding(config) + _bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) - nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - # if two-stage, the last class_embed and bbox_embed is for region proposal generation - num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers - if config.with_box_refine: - self.class_embed = _get_clones(self.class_embed, num_pred) - self.bbox_embed = _get_clones(self.bbox_embed, num_pred) - nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) - # hack implementation for iterative bounding box refinement - self.model.decoder.bbox_embed = self.bbox_embed + + if config.decoder_bbox_embed_share: + self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: - nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) - self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) - self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) - self.model.decoder.bbox_embed = None + self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) + self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) + # hack implementation for two-stage + self.model.decoder.bbox_embed = self.bbox_embed + self.model.decoder.class_embed = self.class_embed + if config.two_stage: - # hack implementation for two-stage - self.model.decoder.class_embed = self.class_embed - for box_embed in self.bbox_embed: - nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + if config.two_stage_bbox_embed_share: + self.model.encoder_output_bbox_embed = _bbox_embed + else: + self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) + + #TODO don't believe this is necessary since class_embed has no parameters + if config.two_stage_class_embed_share: + self.model.encoder_output_class_embed = _class_embed + else: + self.model.encoder_output_class_embed = copy.deepcopy(_class_embed) # Initialize weights and apply final processing self.post_init() From dca093b25bffa2a13ccd2cc7d292601ef83a51a3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 12 Sep 2023 00:16:28 -0300 Subject: [PATCH 015/252] Added all layers to convertion --- .../convert_grounding_dino_to_hf.py | 101 ++++++++++-------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py 
b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index efced9cba0d522..4c74404b19b288 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -66,72 +66,66 @@ def create_rename_keys(state_dict, config): #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer - rename_keys.append(("module.backbone.0.patch_embed.proj.weight", + rename_keys.append(("backbone.0.patch_embed.proj.weight", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("module.backbone.0.patch_embed.proj.bias", + rename_keys.append(("backbone.0.patch_embed.proj.bias", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("module.backbone.0.patch_embed.norm.weight", + rename_keys.append(("backbone.0.patch_embed.norm.weight", "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("module.backbone.0.patch_embed.norm.bias", + rename_keys.append(("backbone.0.patch_embed.norm.bias", "model.backbone.conv_encoder.model.embeddings.norm.bias")) for layer, depth in enumerate(config.backbone_config.depths): for block in range(depth): # layernorms - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) # intermidiate - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) # output - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) # downsample if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.reduction.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.bias", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) for out_indice in config.backbone_config.out_indices: # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"module.backbone.0.norm{out_indice-1}.weight", + rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"module.backbone.0.norm{out_indice-1}.bias", + rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) ########################################## VISION BACKBONE - END - ########################################## TEXT BACKBONE - START - for layer_name, params in state_dict.items(): - if "module.bert" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) - ########################################## TEXT BACKBONE - END - ########################################## ENCODER - START deformable_key_mappings = { 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', @@ -185,23 +179,21 @@ def create_rename_keys(state_dict, config): 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', } - for layer in range(config.encoder_layers): # deformable for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.layers.{layer}.{src}", + 
rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # text enhance for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.text_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # fusion layers for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.fusion_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) ########################################## ENCODER - END - #TODO convert decoder ########################################## DECODER - START key_mappings_decoder = { 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', @@ -234,7 +226,7 @@ def create_rename_keys(state_dict, config): 'norm3.bias': 'final_layer_norm.bias', } for layer_num in range(config.decoder_layers): - source_prefix_decoder = f'module.transformer.decoder.layers.{layer_num}.' + source_prefix_decoder = f'transformer.decoder.layers.{layer_num}.' target_prefix_decoder = f'model.decoder.layers.{layer_num}.' for source_name, target_name in key_mappings_decoder.items(): @@ -246,17 +238,36 @@ def create_rename_keys(state_dict, config): ########################################## HEAD - START ########################################## HEAD - END - #TODO convert additional layers ########################################## Additional - START for layer_name, params in state_dict.items(): + #### TEXT BACKBONE + if "bert" in layer_name: + rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE - if "module.input_proj" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.input_proj", "model.input_proj_vision"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE - if "module.feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.feat_map", "model.input_proj_text"))) - #### - + if "input_proj" in layer_name: + rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE + if "feat_map" in layer_name: + rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) + #### DECODER REFERENCE POINT HEAD + if "transformer.decoder.ref_point_head" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", + "model.decoder.reference_points_head"))) + #### DECODER BBOX EMBED + if "transformer.decoder.bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", + "model.decoder.bbox_embed"))) + if "transformer.enc_output" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) + + if "transformer.enc_out_bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", + "model.encoder_output_bbox_embed"))) + + rename_keys.append(("transformer.level_embed", "model.level_embed")) + rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) + rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) + rename_keys.append(("transformer.tgt_embed.weight", "model.query_position_embeddings.weight")) 
########################################## Additional - END # fmt: on @@ -274,8 +285,8 @@ def read_in_q_k_v(state_dict, config): hidden_size = embed_dim * 2**layer for block in range(depth): # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") + in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] @@ -382,7 +393,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): config = get_grounding_dino_config(model_name) # Load original checkpoint - original_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + original_state_dict = torch.load(checkpoint_path, map_location="cpu") # Rename keys new_state_dict = original_state_dict.copy() @@ -452,7 +463,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) parser.add_argument( "--checkpoint_path", - default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny.pth", + default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", type=str, help="Path to the original PyTorch checkpoint (.pth file).", ) From cba79882fc3a78dffd432511171966f920937dc9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 12 Sep 2023 11:24:24 -0300 Subject: [PATCH 016/252] Fixed outputs for GroundingDINOModel and GroundingDINOForObjectDetection --- .../grounding_dino/modeling_grounding_dino.py | 156 +++++++++++++----- 1 file changed, 113 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 4c35a8cf4b7814..c3d094285dcf0d 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -228,10 +228,9 @@ class GroundingDINOEncoderOutput(ModelOutput): @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOModelOutput(ModelOutput): """ - Base class for outputs of the Deformable DETR encoder-decoder model. + Base class for outputs of the Grounding DINO encoder-decoder model. Args: init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): @@ -250,25 +249,47 @@ class GroundingDINOModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. 
- cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the - self-attention heads. - enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each + layer plus the initial embedding outputs. 
+ encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). - enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. 
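Code that consumed the old single-stream fields (`encoder_last_hidden_state`, `cross_attentions`) now has to pick a modality explicitly. A minimal sketch of the new access pattern, using a dummy output object in place of a real forward pass (all sizes below are illustrative assumptions):

```python
import torch
from transformers.models.grounding_dino.modeling_grounding_dino import GroundingDINOModelOutput

batch, num_queries, seq_vision, seq_text, d_model = 1, 900, 13294, 6, 256  # illustrative only

# stand-in for the value returned by GroundingDINOModel.forward(..., return_dict=True)
outputs = GroundingDINOModelOutput(
    last_hidden_state=torch.randn(batch, num_queries, d_model),
    encoder_last_hidden_state_vision=torch.randn(batch, seq_vision, d_model),
    encoder_last_hidden_state_text=torch.randn(batch, seq_text, d_model),
)

vision_memory = outputs.encoder_last_hidden_state_vision  # flattened multi-scale image features
text_memory = outputs.encoder_last_hidden_state_text      # BERT-encoded text features
```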
""" @@ -278,16 +299,21 @@ class GroundingDINOModelOutput(ModelOutput): intermediate_reference_points: torch.FloatTensor = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->GroundingDINO class GroundingDINOObjectDetectionOutput(ModelOutput): """ Output type of [`GroundingDINOForObjectDetection`]. @@ -320,20 +346,42 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. 
- encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, - 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average - in the self-attention heads. + encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. 
+ encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -359,12 +407,18 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): intermediate_reference_points: Optional[torch.FloatTensor] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - enc_outputs_class: Optional = None - enc_outputs_coord_logits: Optional = None + decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None def _get_clones(module, N): @@ -1988,8 +2042,11 @@ def forward( query_pos = self.get_proposal_pos_embed(reference_points_input[:, :, 0, :]) query_pos = self.reference_points_head(query_pos) + # In original implementation they apply layer norm before outputting intermediate hidden states + # Though that's not through between layers so the layers use as input the output of the previous layer + # withtout layer norm if output_hidden_states: - all_hidden_states += (hidden_states,) + all_hidden_states += (self.layer_norm(hidden_states),) if self.gradient_checkpointing and self.training: @@ -2055,6 +2112,7 @@ def custom_forward(*inputs): # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + hidden_states = self.layer_norm(hidden_states) # add hidden states from the last decoder layer if output_hidden_states: @@ -2463,10 +2521,16 @@ def forward( intermediate_reference_points=decoder_outputs.intermediate_reference_points, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - 
encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, + decoder_cross_attentions_vision=decoder_outputs.vision_cross_attentions, + decoder_cross_attentions_text=decoder_outputs.text_cross_attentions, + encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, + encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, + encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, + encoder_hidden_states_text=encoder_outputs.hidden_states_text, + encoder_attentions_vision=encoder_outputs.attentions_vision, + encoder_attentions_text=encoder_outputs.attentions_text, + encoder_cross_attentions_vision=encoder_outputs.cross_attentions_vision, + encoder_cross_attentions_text=encoder_outputs.cross_attentions_text, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, ) @@ -2588,7 +2652,7 @@ def forward( ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # First, sent images through DETR base model to obtain encoder + decoder outputs + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( pixel_values, pixel_mask=pixel_mask, @@ -2688,10 +2752,16 @@ def forward( last_hidden_state=outputs.last_hidden_state, decoder_hidden_states=outputs.decoder_hidden_states, decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, + decoder_cross_attentions_vision=outputs.decoder_cross_attentions_vision, + decoder_cross_attentions_text=outputs.decoder_cross_attentions_text, + encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, + encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, + encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, + encoder_hidden_states_text=outputs.encoder_hidden_states_text, + encoder_attentions_vision=outputs.encoder_attentions_vision, + encoder_attentions_text=outputs.encoder_attentions_text, + encoder_cross_attentions_text=outputs.encoder_cross_attentions_text, + encoder_cross_attentions_vision=outputs.encoder_cross_attentions_vision, intermediate_hidden_states=outputs.intermediate_hidden_states, intermediate_reference_points=outputs.intermediate_reference_points, init_reference_points=outputs.init_reference_points, From d47864e3ed02f0a25f22eb505bb09f18f7fe6db0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 11:58:02 -0300 Subject: [PATCH 017/252] Fixed mask input to encoders and fixed nn.MultiheadAttention batch first and attn output --- .../convert_grounding_dino_to_hf.py | 30 ++++----- .../grounding_dino/modeling_grounding_dino.py | 61 ++++++++++++------- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4c74404b19b288..15793a0df03ae7 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -385,7 +385,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token tokenized_for_encoder["attention_mask"] = text_self_attention_masks tokenized_for_encoder["position_ids"] = position_ids - return 
tokenized_for_encoder + return tokenized_for_encoder, tokenized.attention_mask.bool() @torch.no_grad() def convert_grounding_dino_checkpoint(model_name, checkpoint_path): @@ -418,25 +418,17 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ] ) image_inputs = image_processor(image) - text_inputs = text_processor(text, config) - - pixel_mask = torch.ones( - ((1, image_inputs.shape[1], image_inputs.shape[2])), - dtype=torch.long, - device=image_inputs.device + text_inputs, text_token_mask = text_processor(text, config) + + outputs = model( + pixel_values=image_inputs.unsqueeze(0), + input_ids=text_inputs["input_ids"], + attention_mask=text_inputs["attention_mask"], + token_type_ids=text_inputs["token_type_ids"], + text_token_mask=text_token_mask, + text_self_attention_masks=text_inputs["attention_mask"], + position_ids=text_inputs["position_ids"], ) - # output = model.model.backbone.conv_encoder.model(pixel_values=image_inputs.unsqueeze(0)) - output = model.model.text_backbone(**text_inputs) - print(output.last_hidden_state[:, :, :5]) - - # for feature_map in output.last_hidden_state: - # print(f"{feature_map.shape}") - # print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") - - # outputs = model(**inputs).logits - - # print(outputs.keys()) - # print("Looks ok!") # if pytorch_dump_folder_path is not None: # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index c3d094285dcf0d..2cc715b10cce4f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -970,7 +970,8 @@ def __init__(self, config): self.self_attn = nn.MultiheadAttention( embed_dim=config.d_model, num_heads=config.encoder_attention_heads // 2, - dropout=config.text_enhancer_dropout + dropout=config.text_enhancer_dropout, + batch_first=True, ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) @@ -999,7 +1000,13 @@ def forward( attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) - attention_output, attention_weights = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks) + attention_output, attention_weights = self.self_attn( + query=q, + key=k, + value=hidden_states, + attn_mask=attention_masks, + average_attn_weights=False + ) hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) @@ -1233,8 +1240,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at (delta_v, vision_attn), (delta_t, text_attn) = self.attn( vision_features, text_features, - attention_mask_vision=attention_mask_vision, - attention_mask_text=attention_mask_text + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text ) vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) text_features = text_features + self.drop_path(self.gamma_l * delta_t) @@ -1448,6 +1455,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, + batch_first=True ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -1459,6 +1467,7 @@ def __init__(self, config: GroundingDINOConfig): 
embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, + batch_first=True ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -1518,7 +1527,8 @@ def forward( query=self.with_pos_embed(hidden_states, position_embeddings), key=self.with_pos_embed(hidden_states, position_embeddings), value=hidden_states, - attn_mask=self_attn_mask + attn_mask=self_attn_mask, + average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1533,6 +1543,7 @@ def forward( key=text_encoder_hidden_states.transpose(0, 1), value=text_encoder_hidden_states.transpose(0, 1), attn_mask=text_encoder_attention_mask, + average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -2423,13 +2434,13 @@ def forward( if encoder_outputs is None: encoder_outputs = self.encoder( vision_features=source_flatten, - vision_attention_mask=mask_flatten, + vision_attention_mask=~mask_flatten, vision_position_embedding=lvl_pos_embed_flatten, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, text_features=text_features, - text_attention_mask=text_token_mask, + text_attention_mask=~text_token_mask, text_position_embedding=None, text_self_attention_masks=text_self_attention_masks, text_position_ids=position_ids, @@ -2599,16 +2610,19 @@ def _set_aux_loss(self, outputs_class, outputs_coord): @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values, - pixel_mask=None, - decoder_attention_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + attention_mask: torch.BoolTensor, + token_type_ids: torch.LongTensor, + text_token_mask: torch.BoolTensor, + text_self_attention_masks: torch.BoolTensor, + position_ids: torch.LongTensor, + pixel_mask: Optional[torch.BoolTensor]=None, + encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]]=None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + return_dict: Optional[bool]=None, ): r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2654,12 +2668,15 @@ def forward( # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( - pixel_values, - pixel_mask=pixel_mask, - decoder_attention_mask=decoder_attention_mask, + pixel_values=pixel_values , + input_ids=input_ids , + attention_mask=attention_mask , + token_type_ids=token_type_ids , + text_token_mask=text_token_mask , + text_self_attention_masks=text_self_attention_masks , + position_ids=position_ids , + pixel_mask=pixel_mask , encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, From b9f9553010ccef357ea9e0cbb509aed59f5957f4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 14:14:57 -0300 Subject: [PATCH 018/252] Fixed forward from GroundingDINOTextEnhancerLayer --- .../grounding_dino/modeling_grounding_dino.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 
deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2cc715b10cce4f..36822d53eaa9ab 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -975,16 +975,14 @@ def __init__(self, config): ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) - self.dropout = nn.Dropout(config.text_enhancer_dropout) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) self.layer_norm_before = nn.LayerNorm(config.d_model) self.layer_norm_after = nn.LayerNorm(config.d_model) - self.dropout1 = nn.Dropout(config.text_enhancer_dropout) - self.dropout2 = nn.Dropout(config.text_enhancer_dropout) self.activation = ACT2FN[config.activation_function] self.num_heads = config.encoder_attention_heads // 2 + self.dropout = config.text_enhancer_dropout def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -995,7 +993,7 @@ def forward( attention_masks: Optional[Tensor] = None, position_embeddings: Optional[Tensor] = None, ): # repeat attn mask - if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[1]: + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) @@ -1007,13 +1005,18 @@ def forward( attn_mask=attention_masks, average_attn_weights=False ) + attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) + hidden_states = hidden_states + attention_output + residual = hidden_states - hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) hidden_states = self.activation(self.fc1(hidden_states)) - attention_output = self.fc2(self.dropout(hidden_states)) - hidden_states = hidden_states + self.dropout2(attention_output) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = hidden_states + residual hidden_states = self.layer_norm_after(hidden_states) + return hidden_states, attention_weights class GroundingDINOBiMultiHeadAttention(nn.Module): @@ -1423,12 +1426,10 @@ def forward( ) (text_features, text_enhanced_attn) = self.text_enhancer_layer( - hidden_states=text_features.transpose(0, 1), + hidden_states=text_features, attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=( - text_position_embedding.transpose(0, 1) if text_position_embedding is not None else None - ), - ).transpose(0, 1) + position_embeddings=(text_position_embedding if text_position_embedding is not None else None) + ) (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, From 35d6639cd226ff0e5b9f4deebce25dc9b3ade2ab Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 14:31:17 -0300 Subject: [PATCH 019/252] Fixed output bug with GroundingDINODeformableLayer --- .../models/grounding_dino/modeling_grounding_dino.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git 
a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 36822d53eaa9ab..e8e147cb00554a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1329,12 +1329,7 @@ def forward( clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights def get_sine_pos_embed( pos_tensor: torch.Tensor, From 23d9048ccca9b1c7210f9b45e78a4be3a8079b51 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 15 Sep 2023 18:57:37 -0300 Subject: [PATCH 020/252] Fixed bugs that prevent GroundingDINOForObjectDetection to run forward method --- .../configuration_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e413d43b55cd89..3a62780362d834 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -204,7 +204,7 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - two_stage_num_proposals=300, + two_stage_num_proposals=900, with_box_refine=True, class_cost=1, bbox_cost=5, diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index e8e147cb00554a..2e9d7d3d0de7f5 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1536,9 +1536,9 @@ def forward( # Cross-Attention Text hidden_states, text_cross_attn_weights = self.encoder_attn_text( query=self.with_pos_embed(hidden_states, position_embeddings), - key=text_encoder_hidden_states.transpose(0, 1), - value=text_encoder_hidden_states.transpose(0, 1), - attn_mask=text_encoder_attention_mask, + key=text_encoder_hidden_states, + value=text_encoder_hidden_states, + key_padding_mask=text_encoder_attention_mask, average_attn_weights=False ) @@ -1590,12 +1590,12 @@ def __init__(self, config): def forward( self, vision_hidden_state: torch.FloatTensor, - text_hiddend_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, text_token_mask: torch.BoolTensor ) -> torch.FloatTensor: - output = vision_hidden_state @ text_hiddend_state.transpose(-1, -2) + output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) # padding to max_text_len @@ -1867,7 +1867,7 @@ def forward( text_position_embedding=text_position_embedding, text_self_attention_masks=text_self_attention_masks, text_position_ids=text_position_ids - ) + ) if output_attentions: @@ -2488,7 +2488,7 @@ def forward( topk_coords_logits = topk_coords_logits.detach() reference_points = topk_coords_logits.sigmoid() init_reference_points = reference_points - if query_embeds: + if query_embeds is not None: target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) else: target = torch.gather( @@ -2679,6 +2679,7 @@ def forward( ) hidden_states = outputs.intermediate_hidden_states if return_dict else 
outputs[2] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[9] init_reference = outputs.init_reference_points if return_dict else outputs[0] inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] @@ -2692,7 +2693,11 @@ def forward( else: reference = inter_references[:, level - 1] reference = inverse_sigmoid(reference) - outputs_class = self.class_embed[level](hidden_states[:, level]) + outputs_class = self.class_embed[level]( + vision_hidden_state=hidden_states[:, level], + text_hidden_state=enc_text_hidden_state, + text_token_mask=text_token_mask + ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: outputs_coord_logits = delta_bbox + reference From 038a63a4e49f74f958f2fa8f6761b0422377de52 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 22:37:58 -0300 Subject: [PATCH 021/252] Fixed attentions to be passed correctly --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2e9d7d3d0de7f5..edbab3773a4fcd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2504,7 +2504,7 @@ def forward( vision_encoder_hidden_states=encoder_outputs[0], vision_encoder_attention_mask=mask_flatten, text_encoder_hidden_states=encoder_outputs[1], - text_encoder_attention_mask=text_token_mask, + text_encoder_attention_mask=~text_token_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, From e113630c5a9bcb59da14fd1793e47f56d2beb6e9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:46:17 -0300 Subject: [PATCH 022/252] Passing temperature arg when creating Sine position embedding --- .../models/grounding_dino/modeling_grounding_dino.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index edbab3773a4fcd..671092a234ee04 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -594,7 +594,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[in return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->GroundingDINO class GroundingDINOSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you @@ -619,8 +618,8 @@ def forward(self, pixel_values, pixel_mask): x_embed = pixel_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 - y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) @@ 
-662,7 +661,7 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding(n_steps, normalize=True) + position_embedding = GroundingDINOSinePositionEmbedding(n_steps, config.positional_embedding_temperature, normalize=True) elif config.position_embedding_type == "learned": position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) else: From 30af3a2953ff0a39cb0f0f38a575935cbbcc7aff Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:47:09 -0300 Subject: [PATCH 023/252] Removed copy comments --- .../models/grounding_dino/modeling_grounding_dino.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 671092a234ee04..000c3e1f23ff1f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -656,7 +656,6 @@ def forward(self, pixel_values, pixel_mask=None): return pos -# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->GroundingDINO def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": From baad9526352c3232e0f541ee707f952bd68c4071 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:48:03 -0300 Subject: [PATCH 024/252] Added temperature argument for position embedding --- .../models/grounding_dino/configuration_grounding_dino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3a62780362d834..e321782b197810 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -150,7 +150,8 @@ class GroundingDINOConfig(PretrainedConfig): Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. - + positional_embedding_temperature (`float`, *optional*, defaults to 20): + The temperature for Sine Positional Embedding that is used together with vision backbone. 
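For intuition, a minimal standalone sketch (plain PyTorch, not the classes touched by this patch) of how the temperature parameter shapes a 1-D sine/cosine position embedding; `num_feats` and the positions are arbitrary:

```python
import torch


def sine_embed_1d(positions: torch.Tensor, num_feats: int = 16, temperature: float = 20.0) -> torch.Tensor:
    """Toy 1-D temperature-scaled sine/cosine position embedding."""
    dim_t = torch.arange(num_feats, dtype=torch.float32)
    # Smaller temperatures push the later dimensions toward higher frequencies;
    # the config above defaults this to 20 for the vision backbone.
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_feats)
    pos = positions[:, None] / dim_t
    # sine on even dimensions, cosine on odd dimensions, interleaved
    return torch.stack((pos[:, 0::2].sin(), pos[:, 1::2].cos()), dim=2).flatten(1)


print(sine_embed_1d(torch.arange(4, dtype=torch.float32)).shape)  # torch.Size([4, 16])
```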
Examples: ```python @@ -227,6 +228,7 @@ def __init__( decoder_bbox_embed_share = True, two_stage_bbox_embed_share = False, two_stage_class_embed_share = False, + positional_embedding_temperature = 20, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -302,6 +304,7 @@ def __init__( if two_stage_bbox_embed_share and not decoder_bbox_embed_share: raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") self.two_stage_class_embed_share = two_stage_class_embed_share + self.positional_embedding_temperature = positional_embedding_temperature super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property From 6e37211690a8db92b4487fd356a089bb5214c6e0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:48:36 -0300 Subject: [PATCH 025/252] Fixed typo when converting weigths to GroundingDINO vision backbone --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 15793a0df03ae7..3fe62356b8e7d9 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -84,7 +84,7 @@ def create_rename_keys(state_dict, config): f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", - f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention @@ -430,6 +430,8 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): position_ids=text_inputs["position_ids"], ) + print("Finished") + # if pytorch_dump_folder_path is not None: # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") # model.save_pretrained(pytorch_dump_folder_path) From 0db05e0547ee3f0d74f7aadbf97726a722e0163d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:31:38 -0300 Subject: [PATCH 026/252] Final modifications on modeling --- .../grounding_dino/modeling_grounding_dino.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 000c3e1f23ff1f..92ccdb41bab011 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1005,9 +1005,9 @@ def forward( ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output - residual = hidden_states - hidden_states = self.layer_norm_before(hidden_states) + + residual = hidden_states hidden_states = self.activation(self.fc1(hidden_states)) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = self.fc2(hidden_states) @@ -1426,7 +1426,7 @@ def forward( (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, - 
attention_mask=key_padding_mask, + attention_mask=~key_padding_mask, position_embeddings=vision_position_embedding, reference_points=reference_points, spatial_shapes=spatial_shapes, @@ -1517,9 +1517,10 @@ def forward( residual = hidden_states # Self Attention + q = k = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=self.with_pos_embed(hidden_states, position_embeddings), - key=self.with_pos_embed(hidden_states, position_embeddings), + query=q, + key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False @@ -1826,9 +1827,6 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - #TODO check if this is necessary according to original implementation - vision_features = nn.functional.dropout(vision_features, p=self.dropout, training=self.training) - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) encoder_vision_states = () if output_hidden_states else None From a1eba2e505247e27bdcd1499218b1226252abffb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:41:35 -0300 Subject: [PATCH 027/252] Removed unnecessary class --- .../grounding_dino/modeling_grounding_dino.py | 119 ------------------ 1 file changed, 119 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 92ccdb41bab011..94090841784322 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -841,125 +841,6 @@ def forward( return output, attention_weights - -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO -class GroundingDINOMultiheadAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. - - Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). - """ - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - bias: bool = True, - ): - super().__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - if self.head_dim * num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {num_heads})." 
- ) - self.scaling = self.head_dim**-0.5 - - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - - def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): - return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): - return tensor if position_embeddings is None else tensor + position_embeddings - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_embeddings: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, target_len, embed_dim = hidden_states.size() - # add position embeddings to the hidden states before projecting to queries and keys - if position_embeddings is not None: - hidden_states_original = hidden_states - hidden_states = self.with_pos_embed(hidden_states, position_embeddings) - - # get queries, keys and values - query_states = self.q_proj(hidden_states) * self.scaling - key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) - value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) - - proj_shape = (batch_size * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - source_len = key_states.size(1) - - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): - raise ValueError( - f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" - f" {attn_weights.size()}" - ) - - # expand attention_mask - if attention_mask is not None: - # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] - attention_mask = _expand_mask(attention_mask, hidden_states.dtype) - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, target_len, source_len): - raise ValueError( - f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" - f" {attention_mask.size()}" - ) - attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask - attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) - attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(batch_size, target_len, embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped - -# Repeting some code to avoid convert nn.MultiheadAttention later #TODO is this an approriate way to name this? class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" From 9cf7c3a272200aa790809e71594e372c58ef8ec2 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:42:41 -0300 Subject: [PATCH 028/252] Fixed convert structure --- .../convert_grounding_dino_to_hf.py | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 3fe62356b8e7d9..5dcaad277092ca 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -388,7 +388,12 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return tokenized_for_encoder, tokenized.attention_mask.bool() @torch.no_grad() -def convert_grounding_dino_checkpoint(model_name, checkpoint_path): +def convert_grounding_dino_checkpoint( + model_name: str, + checkpoint_path: str, + pytorch_dump_folder_path: str = None, + push_to_hub: bool = False +): #Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) @@ -420,6 +425,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): image_inputs = image_processor(image) text_inputs, text_token_mask = text_processor(text, config) + # Running forward outputs = model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], @@ -430,19 +436,17 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): position_ids=text_inputs["position_ids"], ) - print("Finished") + if pytorch_dump_folder_path is not None: + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) - # if pytorch_dump_folder_path is not None: - # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - # model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) - # print(f"Saving image processor to {pytorch_dump_folder_path}") - # image_processor.save_pretrained(pytorch_dump_folder_path) - - # if push_to_hub: - # print(f"Pushing model and image processor for {model_name} to hub") - # 
model.push_to_hub(f"microsoft/{model_name}") - # image_processor.push_to_hub(f"microsoft/{model_name}") + if push_to_hub: + print(f"Pushing model and image processor for {model_name} to hub") + model.push_to_hub(f"microsoft/{model_name}") + image_processor.push_to_hub(f"microsoft/{model_name}") if __name__ == "__main__": @@ -469,4 +473,9 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) args = parser.parse_args() - convert_grounding_dino_checkpoint(args.model_name, args.checkpoint_path) \ No newline at end of file + convert_grounding_dino_checkpoint( + args.model_name, + args.checkpoint_path, + args.pytorch_dump_folder_path, + args.push_to_hub + ) \ No newline at end of file From 9c55b247442a99bf438927f3fa5799b225e14dd9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 24 Sep 2023 01:35:07 -0300 Subject: [PATCH 029/252] Added image processing --- .../image_processing_grounding_dino.py | 967 ++++++++++++++++++ 1 file changed, 967 insertions(+) create mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py new file mode 100644 index 00000000000000..1adf8e8e0dcd62 --- /dev/null +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -0,0 +1,967 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Image processor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import BaseImageProcessor, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + make_list_of_images, + to_numpy_array, + valid_coco_detection_annotations, + valid_images, +) +from ...utils import ( + ExplicitEnum, + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + +if is_scipy_available(): + import scipy.special + import scipy.stats + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotionFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION) + + +# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (height <= width and height == size) or (width <= height and width == size): + return height, width + + if width < height: + ow = size + oh = int(size * height / width) + else: + oh = size + ow = int(size * width / height) + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. 
+ """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> List[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. 
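For reference, the mask produced here is just a binary map with ones over the valid (unpadded) region; a tiny NumPy illustration with made-up sizes:

```python
import numpy as np

# a 2x3 image placed in a 3x4 padded canvas
mask = np.zeros((3, 4), dtype=np.int64)
mask[:2, :3] = 1
print(mask)
# [[1 1 1 0]
#  [1 1 1 0]
#  [0 0 0 0]]
```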
+ """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + +def prepare_coco_detection_annotation( + image, + target, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by GroundingDINO. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints[keep] + + return new_target + +# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities +def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + probs = scipy.special.softmax(logits, axis=-1) + labels = probs.argmax(-1, keepdims=True) + scores = np.take_along_axis(probs, labels, axis=-1) + scores, labels = scores.squeeze(-1), labels.squeeze(-1) + return scores, labels + +# Copied from transformers.models.detr.image_processing_detr.resize_annotation +def resize_annotation( + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. 
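The box and area rescaling performed below reduces to multiplying by the height/width ratios; a small NumPy example with illustrative numbers:

```python
import numpy as np

orig_size, target_size = (480, 640), (800, 1066)
ratio_height = target_size[0] / orig_size[0]    # ≈ 1.667
ratio_width = target_size[1] / orig_size[1]     # ≈ 1.666
boxes = np.array([[10.0, 20.0, 110.0, 220.0]])  # (x0, y0, x1, y1) in the original image
scaled_boxes = boxes * np.array([ratio_width, ratio_height, ratio_width, ratio_height])
scaled_area = np.array([100.0 * 200.0]) * (ratio_width * ratio_height)
print(scaled_boxes.round(1), scaled_area.round(1))
```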
+ """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +class GroundingDINOImageProcessor(BaseImageProcessor): + r""" + Constructs a Grounding DINO image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in + the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be + overridden by the `do_pad` parameter in the `preprocess` method. 
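A minimal usage sketch for this processor. It assumes the class ends up exported from `transformers` once the rest of this series lands (otherwise it can be imported from this module directly); the input is a random image and the padded shape follows from the defaults shown above:

```python
import numpy as np
from transformers import GroundingDINOImageProcessor  # assumes the top-level export added later in this series

image_processor = GroundingDINOImageProcessor()  # shortest_edge=800, longest_edge=1333, do_pad=True
image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 800, 1066])
print(inputs["pixel_mask"].shape)    # torch.Size([1, 800, 1066])
```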
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_pad: bool = True, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + + @classmethod + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + def prepare_annotation( + self, + image: np.ndarray, + target: Dict, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into Grounding DINO model. + """ + target = prepare_coco_detection_annotation( + image, target, input_data_format=input_data_format + ) + + return target + + def prepare(self, image, target): + logger.warning_once( + "The `prepare` method is deprecated and will be removed in a v4.33. " + "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " + "does not return the image anymore.", + ) + target = self.prepare_annotation(image, target) + return image, target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary containing the size to resize to. 
Can contain the keys `shortest_edge` and `longest_edge` or + `height` and `width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> Dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
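Rescaling itself is a plain multiplication by `rescale_factor`; the box normalization applied by `normalize_annotation` just below is similarly mechanical, converting corner coordinates to center format and dividing by the image size. A worked NumPy example with made-up numbers:

```python
import numpy as np

image_height, image_width = 800, 1066
box = np.array([100.0, 200.0, 300.0, 600.0])  # (x0, y0, x1, y1) in pixels
center_format = np.array([
    (box[0] + box[2]) / 2,  # center x
    (box[1] + box[3]) / 2,  # center y
    box[2] - box[0],        # width
    box[3] - box[1],        # height
])
normalized = center_format / np.array([image_width, image_height, image_width, image_height])
print(normalized.round(3))  # approximately [0.188, 0.5, 0.188, 0.5]
```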
+ """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format. + """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: Tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + return padded_image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: List[np.ndarray], + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + image (`np.ndarray`): + Image to pad. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
+ """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + padded_images = [ + self._pad_image( + image, + pad_size, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image after resizing. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. 
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + max_size = None + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, max_size=max_size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_pad = self.do_pad if do_pad is None else do_pad + + if do_resize is not None and size is None: + raise ValueError("Size and max_size must be specified if do_resize is True.") + + if do_rescale is not None and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize is not None and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + images = make_list_of_images(images) + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if not valid_coco_detection_annotations(annotations): + raise ValueError( + "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts" + "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " + "being a list of annotations in the COCO format." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + if annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + data = self.pad( + images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + data = {"pixel_values": images} + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # POSTPROCESSING METHODS - TODO: add support for other frameworks + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`GroundingDINOForObjectDetection`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
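The top-k selection in the body below works on scores flattened across queries and labels, then recovers the query index with an integer division and the label with a modulo; a self-contained sketch of that indexing with random logits and illustrative sizes:

```python
import torch

num_queries, num_labels, k = 900, 256, 5
logits = torch.randn(1, num_queries, num_labels)
prob = logits.sigmoid().view(1, -1)              # (1, num_queries * num_labels)
topk_values, topk_indexes = torch.topk(prob, k, dim=1)
query_indices = torch.div(topk_indexes, num_labels, rounding_mode="floor")  # which box/query
label_indices = topk_indexes % num_labels                                   # which label
print(query_indices.shape, label_indices.shape)  # torch.Size([1, 5]) torch.Size([1, 5])
```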
+ """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100 + ): + """ + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`GroundingDINOForObjectDetection`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + top_k (`int`, *optional*, defaults to 100): + Keep only top k bounding boxes before filtering by thresholding. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
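A self-contained usage sketch for this post-processing step. `DummyOutputs` stands in for the real detection output of the model elsewhere in this series; the shapes, threshold, and target size are arbitrary:

```python
from dataclasses import dataclass

import torch

from transformers import GroundingDINOImageProcessor  # assumes the top-level export added later in this series


@dataclass
class DummyOutputs:
    logits: torch.Tensor      # (batch, num_queries, num_labels), pre-sigmoid
    pred_boxes: torch.Tensor  # (batch, num_queries, 4), normalized (cx, cy, w, h)


processor = GroundingDINOImageProcessor()
outputs = DummyOutputs(logits=torch.randn(1, 900, 256), pred_boxes=torch.rand(1, 900, 4))
results = processor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=torch.tensor([[480, 640]])
)[0]
print(results["scores"].shape, results["labels"].shape, results["boxes"].shape)
```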
+ """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = out_logits.sigmoid() + prob = prob.view(out_logits.shape[0], -1) + k_value = min(top_k, prob.size(1)) + topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results From ae570bbdf31b249ee8d16fb4742864ec82f6aff3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 24 Sep 2023 01:37:59 -0300 Subject: [PATCH 030/252] make fixup partially completed --- docs/source/en/tasks/object_detection.md | 2 +- src/transformers/__init__.py | 32 +- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 6 +- .../models/auto/feature_extraction_auto.py | 1 - .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 +- .../configuration_grounding_dino.py | 35 +- .../convert_grounding_dino_to_hf.py | 163 +++---- .../grounding_dino/modeling_grounding_dino.py | 405 +++++++++--------- .../processing_grounding_dino.py | 0 .../tokenization_grounding_dino.py | 0 src/transformers/utils/dummy_pt_objects.py | 48 +-- utils/check_repo.py | 1 + 14 files changed, 347 insertions(+), 354 deletions(-) create mode 100644 src/transformers/models/grounding_dino/processing_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/tokenization_grounding_dino.py diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 4eab9e58fb27da..38498417c6fb77 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [Grounding DINO](../model_doc/grounding-dino), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Grounding DINO](../model_doc/grounding-dino), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aa2f7837b4ce67..4ea2c3ace121ea 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -274,7 +274,6 @@ "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", 
"DebertaV2Config"], "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"], "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], "models.deprecated": [], "models.deprecated.bort": [], @@ -358,6 +357,7 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -1542,14 +1542,6 @@ "DeformableDetrPreTrainedModel", ] ) - _import_structure["models.grounding_dino"].extend( - [ - "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", - "GroundingDINOForObjectDetection", - "GroundingDINOModel", - "GroundingDINOPreTrainedModel", - ] - ) _import_structure["models.deit"].extend( [ "DEIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1921,6 +1913,14 @@ "GraphormerPreTrainedModel", ] ) + _import_structure["models.grounding_dino"].extend( + [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + ) _import_structure["models.groupvit"].extend( [ "GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4338,7 +4338,6 @@ DecisionTransformerConfig, ) from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.deprecated.mctct import ( MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -4414,6 +4413,7 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, @@ -5445,12 +5445,6 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) - from .models.grounding_dino import ( - GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, - GroundingDINOForObjectDetection, - GroundingDINOModel, - GroundingDINOPreTrainedModel, - ) from .models.deit import ( DEIT_PRETRAINED_MODEL_ARCHIVE_LIST, DeiTForImageClassification, @@ -5753,6 +5747,12 @@ GraphormerModel, GraphormerPreTrainedModel, ) + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) from .models.groupvit import ( GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST, GroupViTModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 376f9353608e56..32e022f6d1d873 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -59,7 +59,6 @@ deberta_v2, decision_transformer, deformable_detr, - grounding_dino, deit, deprecated, deta, @@ -98,6 +97,7 @@ gptj, gptsan_japanese, graphormer, + grounding_dino, groupvit, herbert, hubert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index db5e5f86761b88..c60f2bd5aa0256 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ 
b/src/transformers/models/auto/configuration_auto.py @@ -71,7 +71,6 @@ ("deberta-v2", "DebertaV2Config"), ("decision_transformer", "DecisionTransformerConfig"), ("deformable_detr", "DeformableDetrConfig"), - ("grounding-dino", "GroundingDINOConfig"), ("deit", "DeiTConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), @@ -107,6 +106,7 @@ ("gptj", "GPTJConfig"), ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), + ("grounding-dino", "GroundingDINOConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), @@ -278,7 +278,6 @@ ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deta", "DETA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -312,6 +311,7 @@ ("gptj", "GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gptsan-japanese", "GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -476,7 +476,6 @@ ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), ("deformable_detr", "Deformable DETR"), - ("grounding-dino", "Grounding DINO"), ("deit", "DeiT"), ("deplot", "DePlot"), ("deta", "DETA"), @@ -517,6 +516,7 @@ ("gptj", "GPT-J"), ("gptsan-japanese", "GPTSAN-japanese"), ("graphormer", "Graphormer"), + ("grounding-dino", "Grounding DINO"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), ("hubert", "Hubert"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 78a0686c4816b0..90ece37c657191 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -50,7 +50,6 @@ ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("data2vec-vision", "BeitFeatureExtractor"), ("deformable_detr", "DeformableDetrFeatureExtractor"), - ("grounding-dino", "GroundingDINOFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("dinat", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index ec8bf20938fd7a..d6d722b3e0842b 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -53,7 +53,6 @@ ("cvt", "ConvNextImageProcessor"), ("data2vec-vision", "BeitImageProcessor"), ("deformable_detr", "DeformableDetrImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), ("deit", "DeiTImageProcessor"), ("deta", "DetaImageProcessor"), ("detr", "DetrImageProcessor"), @@ -67,6 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2c54349e8306b2..abfa4f0e50328c 100755 --- 
a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -69,7 +69,6 @@ ("deberta-v2", "DebertaV2Model"), ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), - ("grounding-dino", "GroundingDINOModel"), ("deit", "DeiTModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), @@ -104,6 +103,7 @@ ("gptj", "GPTJModel"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), + ("grounding-dino", "GroundingDINOModel"), ("groupvit", "GroupViTModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), @@ -620,9 +620,9 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), - ("grounding-dino", "GroundingDINOForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), + ("grounding-dino", "GroundingDINOForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e321782b197810..09b9c41f131964 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -26,11 +26,10 @@ } - class GroundingDINOConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate - a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a + This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a + Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. @@ -147,9 +146,11 @@ class GroundingDINOConfig(PretrainedConfig): decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): Whether to share the bbox embedding between the decoder and the two-stage bbox generator. two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal + generation. two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. + Whether to share the class embedding between the two-stage bbox generator and the region proposal + generation. positional_embedding_temperature (`float`, *optional*, defaults to 20): The temperature for Sine Positional Embedding that is used together with vision backbone. 
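A minimal sketch of how the new text/fusion arguments documented above might be passed when instantiating the configuration; the parameter names and default values are copied from the `__init__` signature reformatted in this hunk, and the top-level import assumes the `GroundingDINOConfig` export added earlier in this series:

from transformers import GroundingDINOConfig

# Hypothetical instantiation; every value below is simply the default from the signature
# in this patch and can be overridden as needed.
config = GroundingDINOConfig(
    max_text_len=256,
    sub_sentence_present=True,
    text_enhancer_dropout=0.0,
    fusion_droppath=0.1,
    fusion_dropout=0.0,
    embedding_init_target=True,
    query_dim=4,
    decoder_bbox_embed_share=True,
    two_stage_bbox_embed_share=False,
    two_stage_class_embed_share=False,
    positional_embedding_temperature=20,
)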
Examples: @@ -217,18 +218,18 @@ def __init__( eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, - #other parameters - max_text_len = 256, - sub_sentence_present = True, - text_enhancer_dropout = 0.0, - fusion_droppath = 0.1, - fusion_dropout = 0.0, - embedding_init_target = True, - query_dim = 4, - decoder_bbox_embed_share = True, - two_stage_bbox_embed_share = False, - two_stage_class_embed_share = False, - positional_embedding_temperature = 20, + # other parameters + max_text_len=256, + sub_sentence_present=True, + text_enhancer_dropout=0.0, + fusion_droppath=0.1, + fusion_dropout=0.0, + embedding_init_target=True, + query_dim=4, + decoder_bbox_embed_share=True, + two_stage_bbox_embed_share=False, + two_stage_class_embed_share=False, + positional_embedding_temperature=20, **kwargs, ): if backbone_config is not None and use_timm_backbone: diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 5dcaad277092ca..4f2f3716329ed4 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -14,7 +14,8 @@ # limitations under the License. """Convert GroundingDINO SimMIM checkpoints from the original repository. -URL: https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" +URL: +https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" import argparse @@ -22,11 +23,9 @@ import torch from PIL import Image from torchvision import transforms as T -import torchvision.transforms.functional as F -from transformers import ( - GroundingDINOConfig, GroundingDINOForObjectDetection, AutoTokenizer -) +from transformers import AutoTokenizer, GroundingDINOConfig, GroundingDINOForObjectDetection + IMAGENET_MEAN = [0.485, 0.456, 0.406] IMAGENET_STD = [0.229, 0.224, 0.225] @@ -66,64 +65,64 @@ def create_rename_keys(state_dict, config): #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer - rename_keys.append(("backbone.0.patch_embed.proj.weight", + rename_keys.append(("backbone.0.patch_embed.proj.weight", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.patch_embed.proj.bias", + rename_keys.append(("backbone.0.patch_embed.proj.bias", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.patch_embed.norm.weight", + rename_keys.append(("backbone.0.patch_embed.norm.weight", "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.patch_embed.norm.bias", + rename_keys.append(("backbone.0.patch_embed.norm.bias", "model.backbone.conv_encoder.model.embeddings.norm.bias")) for layer, depth in enumerate(config.backbone_config.depths): for block in range(depth): # layernorms - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) # intermidiate - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) - + # output - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) - + # downsample if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) - + for out_indice in config.backbone_config.out_indices: # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", + rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", + rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) - + ########################################## VISION BACKBONE - END ########################################## ENCODER - START @@ -182,15 +181,15 @@ def create_rename_keys(state_dict, config): for layer in range(config.encoder_layers): # deformable for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # text enhance for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # fusion layers for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) ########################################## ENCODER - END @@ -230,7 +229,7 @@ def create_rename_keys(state_dict, config): target_prefix_decoder = f'model.decoder.layers.{layer_num}.' 
for source_name, target_name in key_mappings_decoder.items(): - rename_keys.append((source_prefix_decoder + source_name, + rename_keys.append((source_prefix_decoder + source_name, target_prefix_decoder + target_name)) ########################################## DECODER - END @@ -240,7 +239,7 @@ def create_rename_keys(state_dict, config): ########################################## Additional - START for layer_name, params in state_dict.items(): - #### TEXT BACKBONE + #### TEXT BACKBONE if "bert" in layer_name: rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE @@ -251,19 +250,19 @@ def create_rename_keys(state_dict, config): rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) #### DECODER REFERENCE POINT HEAD if "transformer.decoder.ref_point_head" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", "model.decoder.reference_points_head"))) #### DECODER BBOX EMBED if "transformer.decoder.bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", "model.decoder.bbox_embed"))) if "transformer.enc_output" in layer_name: rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) - + if "transformer.enc_out_bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", + rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", "model.encoder_output_bbox_embed"))) - + rename_keys.append(("transformer.level_embed", "model.level_embed")) rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) @@ -273,10 +272,12 @@ def create_rename_keys(state_dict, config): # fmt: on return rename_keys + def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val + # we split up the matrix of each encoder layer into queries, keys and values def read_in_q_k_v(state_dict, config): ########################################## VISION BACKBONE - START @@ -288,14 +289,26 @@ def read_in_q_k_v(state_dict, config): in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] - - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight"] = in_proj_weight[-hidden_size :, :] - 
state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias"] = in_proj_bias[-hidden_size :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight" + ] = in_proj_weight[:hidden_size, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias" + ] = in_proj_bias[:hidden_size] + + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight" + ] = in_proj_weight[hidden_size : hidden_size * 2, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias" + ] = in_proj_bias[hidden_size : hidden_size * 2] + + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight" + ] = in_proj_weight[-hidden_size:, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias" + ] = in_proj_bias[-hidden_size:] ########################################## VISION BACKBONE - END @@ -305,12 +318,14 @@ def prepare_img(): image = Image.open(requests.get(url, stream=True).raw).convert("RGB") return image + def text_processor(text: str, config): def preprocess_caption(caption: str) -> str: result = caption.lower().strip() if result.endswith("."): return result return result + "." + def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: """Generate attention mask between each pair of special tokens Args: @@ -330,9 +345,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token idxs = torch.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = ( - torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - ) + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) position_ids = torch.zeros((bs, num_token), device=input_ids.device) cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 @@ -352,8 +365,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token previous_col = col cate_to_token_mask_list = [ - torch.stack(cate_to_token_mask_listi, dim=0) - for cate_to_token_mask_listi in cate_to_token_mask_list + torch.stack(cate_to_token_mask_listi, dim=0) for cate_to_token_mask_listi in cate_to_token_mask_list ] # # padding mask @@ -361,23 +373,23 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() return attention_mask, position_ids.to(torch.long) + tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens) - + tokenized, special_tokens + ) + max_text_len = config.max_text_len sub_sentence_present = config.sub_sentence_present if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[ - :, : max_text_len, : max_text_len - ] - position_ids = position_ids[:, : max_text_len] - tokenized["input_ids"] = 
tokenized["input_ids"][:, : max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, : max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : max_text_len] + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] # extract text embeddings if sub_sentence_present: @@ -387,14 +399,12 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return tokenized_for_encoder, tokenized.attention_mask.bool() + @torch.no_grad() def convert_grounding_dino_checkpoint( - model_name: str, - checkpoint_path: str, - pytorch_dump_folder_path: str = None, - push_to_hub: bool = False + model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str = None, push_to_hub: bool = False ): - #Define default GroundingDINO configuation + # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) # Load original checkpoint @@ -403,7 +413,7 @@ def convert_grounding_dino_checkpoint( # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) - + for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) @@ -416,17 +426,13 @@ def convert_grounding_dino_checkpoint( image = prepare_img() text = "a cat" image_processor = T.Compose( - [ - T.Resize(size=800, max_size=1333), - T.ToTensor(), - T.Normalize(IMAGENET_MEAN, IMAGENET_STD) - ] + [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] ) image_inputs = image_processor(image) text_inputs, text_token_mask = text_processor(text, config) # Running forward - outputs = model( + model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"], @@ -474,8 +480,5 @@ def convert_grounding_dino_checkpoint( args = parser.parse_args() convert_grounding_dino_checkpoint( - args.model_name, - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.push_to_hub - ) \ No newline at end of file + args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub + ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 94090841784322..69264d51b5e6b0 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -40,13 +40,11 @@ requires_backends, ) from ...modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPoolingAndCrossAttentions, - BaseModelOutputWithPastAndCrossAttentions + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...pytorch_utils import meshgrid +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone from .configuration_grounding_dino import GroundingDINOConfig @@ -135,7 +133,6 @@ def 
backward(context, grad_output): ] - @dataclass class GroundingDINODecoderOutput(ModelOutput): """ @@ -177,11 +174,11 @@ class GroundingDINODecoderOutput(ModelOutput): vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + @dataclass class GroundingDINOEncoderOutput(ModelOutput): """ - Base class for outputs of the GroundingDINOEncoder. This class extends - BaseModelOutput, due to: + Base class for outputs of the GroundingDINOEncoder. This class extends BaseModelOutput, due to: - vision and text last hidden states - vision and text intermediate hidden states - vision and text attentions @@ -193,30 +190,31 @@ class GroundingDINOEncoderOutput(ModelOutput): last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the text encoder. hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer - plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer - plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. 
+ Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. """ + last_hidden_state_vision: torch.FloatTensor = None last_hidden_state_text: torch.FloatTensor = None hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None @@ -262,29 +260,29 @@ class GroundingDINOModelOutput(ModelOutput): encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. 
@@ -359,29 +357,29 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. 
Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -618,8 +616,8 @@ def forward(self, pixel_values, pixel_mask): x_embed = pixel_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) @@ -660,7 +658,9 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding(n_steps, config.positional_embedding_temperature, normalize=True) + position_embedding = GroundingDINOSinePositionEmbedding( + n_steps, config.positional_embedding_temperature, normalize=True + ) elif config.position_embedding_type == "learned": position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) else: @@ -841,17 +841,19 @@ def forward( return output, attention_weights -#TODO is this an approriate way to name this? + +# TODO is this an approriate way to name this? 
class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" + def __init__(self, config): super().__init__() self.self_attn = nn.MultiheadAttention( - embed_dim=config.d_model, - num_heads=config.encoder_attention_heads // 2, + embed_dim=config.d_model, + num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout, batch_first=True, - ) + ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) @@ -871,18 +873,14 @@ def forward( hidden_states: Tensor, attention_masks: Optional[Tensor] = None, position_embeddings: Optional[Tensor] = None, - ): # repeat attn mask + ): # repeat attn mask if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) attention_output, attention_weights = self.self_attn( - query=q, - key=k, - value=hidden_states, - attn_mask=attention_masks, - average_attn_weights=False + query=q, key=k, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output @@ -897,16 +895,10 @@ def forward( hidden_states = self.layer_norm_after(hidden_states) return hidden_states, attention_weights - + + class GroundingDINOBiMultiHeadAttention(nn.Module): - def __init__( - self, - vision_dim: int, - text_dim: int, - embed_dim: int, - num_heads: int, - dropout:float = 0.1 - ): + def __init__(self, vision_dim: int, text_dim: int, embed_dim: int, num_heads: int, dropout: float = 0.1): super().__init__() self.embed_dim = embed_dim @@ -949,12 +941,12 @@ def _reset_parameters(self): self.out_text_proj.bias.data.fill_(0) def forward( - self, - vision_features: Tensor, - text_features: Tensor, - vision_attention_mask: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None - ): + self, + vision_features: Tensor, + text_features: Tensor, + vision_attention_mask: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + ): """_summary_ Args: @@ -1000,21 +992,21 @@ def forward( attn_weights = attn_weights - attn_weights.max() attn_weights = torch.clamp( - attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range attn_weights = torch.clamp( - attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range attn_weights_T = attn_weights.transpose(1, 2) text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] - + text_attn_weights = torch.clamp( - text_attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + text_attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range text_attn_weights = torch.clamp( - text_attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + text_attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range # mask vison for language if vision_attention_mask is not None: @@ -1027,9 +1019,7 @@ def forward( # mask language 
for vision if text_attention_mask is not None: - text_attention_mask = ( - text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) - ) + text_attention_mask = text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) attn_weights.masked_fill_(text_attention_mask, float("-inf")) vision_attn_weights = attn_weights.softmax(dim=-1) @@ -1062,6 +1052,7 @@ def forward( return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) + # Copied from transformers.models.beit.modeling_beit.drop_path def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: """ @@ -1082,6 +1073,7 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals output = input.div(keep_prob) * random_tensor return output + # Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO class GroundingDINODropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" @@ -1095,6 +1087,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) + + class GroundingDINOFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() @@ -1104,11 +1098,11 @@ def __init__(self, config, init_values=1e-4): self.layer_norm_vision = nn.LayerNorm(config.d_model) self.layer_norm_text = nn.LayerNorm(config.d_model) self.attn = GroundingDINOBiMultiHeadAttention( - vision_dim=config.d_model, - text_dim=config.d_model, - embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.encoder_attention_heads // 2, - dropout=config.fusion_dropout + vision_dim=config.d_model, + text_dim=config.d_model, + embed_dim=config.encoder_ffn_dim // 2, + num_heads=config.encoder_attention_heads // 2, + dropout=config.fusion_dropout, ) # add layer scale for training stability @@ -1120,17 +1114,18 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) (delta_v, vision_attn), (delta_t, text_attn) = self.attn( - vision_features, - text_features, - vision_attention_mask=attention_mask_vision, - text_attention_mask=attention_mask_text + vision_features, + text_features, + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text, ) vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) text_features = text_features + self.drop_path(self.gamma_l * delta_t) return (vision_features, vision_attn), (text_features, text_attn) -#NOTE just renamed the class + +# NOTE just renamed the class class GroundingDINODeformableLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() @@ -1210,12 +1205,13 @@ def forward( return hidden_states, attn_weights + def get_sine_pos_embed( pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000, exchange_xy: bool = True, - ) -> Tensor: +) -> Tensor: """generate sine position embedding from a position tensor Args: pos_tensor (torch.Tensor): shape: [..., n]. 
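A self-contained, simplified sketch of the sine position embedding computed by `get_sine_pos_embed` above, written under the same argument names (the `exchange_xy` option is ignored); this is an illustrative reimplementation and an assumption about the behavior, not the code from this patch:

import math

import torch


def sine_pos_embed_sketch(pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000) -> torch.Tensor:
    # Expand each scalar position in the trailing dimension into `num_pos_feats` channels
    # with geometrically increasing wavelengths controlled by `temperature`.
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
    pos = pos_tensor[..., None] * (2 * math.pi) / dim_t  # shape: [..., n, num_pos_feats]
    # Interleave sine on even channels and cosine on odd channels.
    return torch.stack((pos[..., 0::2].sin(), pos[..., 1::2].cos()), dim=-1).flatten(-2)


# E.g. position ids of shape (batch_size, seq_len) map to embeddings of shape (batch_size, seq_len, 128).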
@@ -1250,26 +1246,19 @@ def __init__(self, config) -> None: self.deformable_layer = GroundingDINODeformableLayer(config) def get_text_position_embeddings( - self, - text_features: Tensor, - text_position_embedding: Tensor, - text_position_ids: Tensor - ) -> Tensor: + self, text_features: Tensor, text_position_embedding: Tensor, text_position_ids: Tensor + ) -> Tensor: bs, n_text, text_dim = text_features.shape if text_position_embedding is None and text_position_ids is None: text_position_embedding = ( - torch.arange(n_text, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(bs, 1, 1) + torch.arange(n_text, device=text_features.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs, 1, 1) ) text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) if text_position_ids is not None: text_position_embedding = get_sine_pos_embed( text_position_ids[..., None], num_pos_feats=256, exchange_xy=False ) - + return text_position_embedding def forward( @@ -1284,12 +1273,10 @@ def forward( text_attention_mask: Optional[Tensor] = None, text_position_embedding: Optional[Tensor] = None, text_self_attention_masks: Optional[Tensor] = None, - text_position_ids: Optional[Tensor] = None + text_position_ids: Optional[Tensor] = None, ): text_position_embedding = self.get_text_position_embeddings( - text_features, - text_position_embedding, - text_position_ids + text_features, text_position_embedding, text_position_ids ) (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( @@ -1302,7 +1289,7 @@ def forward( (text_features, text_enhanced_attn) = self.text_enhancer_layer( hidden_states=text_features, attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=(text_position_embedding if text_position_embedding is not None else None) + position_embeddings=(text_position_embedding if text_position_embedding is not None else None), ) (vision_features, vision_deformable_attn) = self.deformable_layer( @@ -1315,8 +1302,8 @@ def forward( ) return ( - (vision_features, text_features), - (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn) + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn), ) @@ -1330,7 +1317,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True + batch_first=True, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -1342,7 +1329,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True + batch_first=True, ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -1400,11 +1387,7 @@ def forward( # Self Attention q = k = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=q, - key=k, - value=hidden_states, - attn_mask=self_attn_mask, - average_attn_weights=False + query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1419,7 +1402,7 @@ def forward( key=text_encoder_hidden_states, value=text_encoder_hidden_states, key_padding_mask=text_encoder_attention_mask, - 
average_attn_weights=False + average_attn_weights=False, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1462,19 +1445,18 @@ def forward( return outputs + class GroundingDINOContrastiveEmbedding(nn.Module): def __init__(self, config): super().__init__() self.max_text_len = config.max_text_len def forward( - self, - vision_hidden_state: torch.FloatTensor, - text_hidden_state: torch.FloatTensor, - text_token_mask: torch.BoolTensor - ) -> torch.FloatTensor: - - + self, + vision_hidden_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor, + ) -> torch.FloatTensor: output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) @@ -1484,6 +1466,7 @@ def forward( return new_output + # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead class GroundingDINOClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" @@ -1503,30 +1486,29 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->GroundingDINO class GroundingDINOPreTrainedModel(PreTrainedModel): config_class = GroundingDINOConfig base_model_prefix = "model" main_input_name = "pixel_values" def _init_weights(self, module): - std = self.config.init_std - if isinstance(module, GroundingDINOLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): module._reset_parameters() - elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() + elif isinstance(module, GroundingDINOBiMultiHeadAttention): + module._reset_parameters() + elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + elif isinstance(module, GroundingDINOModel): + nn.init.constant_(module.input_proj_text.bias.data, 0) + nn.init.xavier_uniform_(module.input_proj_text.weight.data) + for proj in module.input_proj_vision: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) if hasattr(module, "reference_points") and not self.config.two_stage: nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) nn.init.constant_(module.reference_points.bias.data, 0.0) @@ -1743,9 +1725,8 @@ def forward( text_attention_mask=text_attention_mask, text_position_embedding=text_position_embedding, text_self_attention_masks=text_self_attention_masks, - text_position_ids=text_position_ids - ) - + text_position_ids=text_position_ids, + ) if output_attentions: all_attn_fused_vision += (attentions[0],) @@ -1759,9 +1740,12 @@ def forward( if not return_dict: enc_outputs = [ - vision_features, text_features, - all_attn_fused_vision, all_attn_fused_text, - all_attn_enhanced_text, all_attn_deformable + vision_features, + text_features, + 
all_attn_fused_vision, + all_attn_fused_text, + all_attn_enhanced_text, + all_attn_deformable, ] return tuple(v for v in enc_outputs if v is not None) return GroundingDINOEncoderOutput( @@ -1772,9 +1756,10 @@ def forward( cross_attentions_vision=all_attn_fused_vision, cross_attentions_text=all_attn_fused_text, attentions_vision=all_attn_deformable, - attentions_text=all_attn_enhanced_text + attentions_text=all_attn_enhanced_text, ) + class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. @@ -1797,10 +1782,7 @@ def __init__(self, config: GroundingDINOConfig): self.layer_norm = nn.LayerNorm(config.d_model) self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.reference_points_head = GroundingDINOMLPPredictionHead( - config.query_dim // 2 * config.d_model, - config.d_model, - config.d_model, - 2 + config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2 ) self.gradient_checkpointing = False @@ -1826,7 +1808,7 @@ def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTen # batch_size, num_queries, num_pos_feats pos_x = pos_x[:, :, None] / dim_t pos_y = pos_y[:, :, None] / dim_t - # batch_size, num_queries, num_pos_feats + # batch_size, num_queries, num_pos_feats pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) @@ -1849,8 +1831,6 @@ def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTen raise ValueError("Unknown proposals shape(-1):{}".format(proposals.size(-1))) return pos - - def forward( self, inputs_embeds, @@ -1959,7 +1939,7 @@ def custom_forward(*inputs): text_encoder_hidden_states=text_encoder_hidden_states, text_encoder_attention_mask=text_encoder_attention_mask, self_attn_mask=self_attn_mask, - output_attentions=output_attentions + output_attentions=output_attentions, ) hidden_states = layer_outputs[0] @@ -1992,7 +1972,6 @@ def custom_forward(*inputs): if vision_encoder_hidden_states is not None: all_cross_attns_vision += (layer_outputs[3],) - # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) @@ -2012,7 +1991,7 @@ def custom_forward(*inputs): all_hidden_states, all_self_attns, all_cross_attns_vision, - all_cross_attns_text + all_cross_attns_text, ] if v is not None ) @@ -2023,7 +2002,7 @@ def custom_forward(*inputs): hidden_states=all_hidden_states, attentions=all_self_attns, vision_cross_attentions=all_cross_attns_vision, - text_cross_attentions=all_cross_attns_text + text_cross_attentions=all_cross_attns_text, ) @@ -2075,7 +2054,7 @@ def __init__(self, config: GroundingDINOConfig): ) # Create text backbone - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) + self.text_backbone = GroundingDINOTextPrenet(config.text_backbone_config) self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: @@ -2199,7 +2178,7 @@ def forward( text_token_mask: Tensor, text_self_attention_masks: Tensor, position_ids: Tensor, - pixel_mask: Optional[Tensor]=None, + pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, output_hidden_states=None, @@ -2236,7 +2215,9 @@ def forward( return_dict 
= return_dict if return_dict is not None else self.config.use_return_dict # Extract text features from text backbone - text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)["last_hidden_state"] + text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)[ + "last_hidden_state" + ] text_features = self.input_proj_text(text_features) batch_size, num_channels, height, width = pixel_values.shape @@ -2319,7 +2300,7 @@ def forward( text_position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict + return_dict=return_dict, ) # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): @@ -2346,9 +2327,7 @@ def forward( # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) enc_outputs_class = self.encoder_output_class_embed( - object_query_embedding, - encoder_outputs[1], - text_token_mask + object_query_embedding, encoder_outputs[1], text_token_mask ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) @@ -2389,7 +2368,7 @@ def forward( self_attn_mask=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict + return_dict=return_dict, ) if not return_dict: @@ -2422,8 +2401,8 @@ def forward( @add_start_docstrings( """ - Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on - top, for tasks such as COCO detection. + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, + for tasks such as COCO detection. 
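Aside (illustrative, not part of the patch): the two-stage path above scores each encoder proposal against the text tokens via `encoder_output_class_embed` and regresses a box delta via `encoder_output_bbox_embed`; the decoder is then seeded with the best-scoring proposals. A minimal numeric sketch of that selection step with toy shapes follows; the exact top-k calls are not shown in this hunk, so treat the pattern as the usual Deformable-DETR-style selection rather than a verbatim excerpt.

```python
# Toy-shape sketch of two-stage query selection; assumed pattern, not copied from the patch.
import torch

batch_size, num_proposals, num_text_tokens, num_queries = 2, 100, 16, 9
enc_outputs_class = torch.randn(batch_size, num_proposals, num_text_tokens)  # text-token logits per proposal
enc_outputs_coord_logits = torch.randn(batch_size, num_proposals, 4)         # unnormalized box logits

# Rank proposals by their best text-token logit and keep the top `num_queries`.
topk_indices = torch.topk(enc_outputs_class.max(-1).values, num_queries, dim=1).indices
topk_coords_logits = torch.gather(
    enc_outputs_coord_logits, 1, topk_indices.unsqueeze(-1).repeat(1, 1, 4)
)
reference_points = topk_coords_logits.sigmoid()  # normalized boxes that seed the decoder queries
print(reference_points.shape)  # torch.Size([2, 9, 4])
```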
""", GROUNDING_DINO_START_DOCSTRING, ) @@ -2446,13 +2425,12 @@ def __init__(self, config: GroundingDINOConfig): nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - if config.decoder_bbox_embed_share: self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) - # hack implementation for two-stage + # hack implementation for two-stage self.model.decoder.bbox_embed = self.bbox_embed self.model.decoder.class_embed = self.class_embed @@ -2461,8 +2439,8 @@ def __init__(self, config: GroundingDINOConfig): self.model.encoder_output_bbox_embed = _bbox_embed else: self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) - - #TODO don't believe this is necessary since class_embed has no parameters + + # TODO don't believe this is necessary since class_embed has no parameters if config.two_stage_class_embed_share: self.model.encoder_output_class_embed = _class_embed else: @@ -2490,12 +2468,12 @@ def forward( text_token_mask: torch.BoolTensor, text_self_attention_masks: torch.BoolTensor, position_ids: torch.LongTensor, - pixel_mask: Optional[torch.BoolTensor]=None, - encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]]=None, - labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, + pixel_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ): r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2541,14 +2519,14 @@ def forward( # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( - pixel_values=pixel_values , - input_ids=input_ids , - attention_mask=attention_mask , - token_type_ids=token_type_ids , - text_token_mask=text_token_mask , - text_self_attention_masks=text_self_attention_masks , - position_ids=position_ids , - pixel_mask=pixel_mask , + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + position_ids=position_ids, + pixel_mask=pixel_mask, encoder_outputs=encoder_outputs, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -2573,8 +2551,8 @@ def forward( outputs_class = self.class_embed[level]( vision_hidden_state=hidden_states[:, level], text_hidden_state=enc_text_hidden_state, - text_token_mask=text_token_mask - ) + text_token_mask=text_token_mask, + ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: outputs_coord_logits = delta_bbox + reference @@ -3117,6 +3095,7 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): raise ValueError("Only 3-dimensional tensors are supported") return NestedTensor(tensor, mask) + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText class GroundingDINOTextEmbeddings(nn.Module): """Construct the embeddings from word, position and 
token_type embeddings.""" @@ -3181,8 +3160,10 @@ def forward( embeddings = self.dropout(embeddings) return embeddings + # Classes for Text Backbone (It's just a BERT model) + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3317,6 +3298,7 @@ def forward( outputs = outputs + (past_key_value,) return outputs + # Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText class GroundingDINOTextSelfOutput(nn.Module): def __init__(self, config): @@ -3331,6 +3313,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText class GroundingDINOTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3380,6 +3363,7 @@ def forward( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + # Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText class GroundingDINOTextIntermediate(nn.Module): def __init__(self, config): @@ -3395,6 +3379,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText class GroundingDINOTextOutput(nn.Module): def __init__(self, config): @@ -3409,6 +3394,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText class GroundingDINOTextLayer(nn.Module): def __init__(self, config): @@ -3495,6 +3481,7 @@ def feed_forward_chunk(self, attention_output): layer_output = self.output(intermediate_output, attention_output) return layer_output + # Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText class GroundingDINOTextEncoder(nn.Module): def __init__(self, config): @@ -3593,6 +3580,7 @@ def custom_forward(*inputs): cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText class GroundingDINOTextPooler(nn.Module): def __init__(self, config): @@ -3608,7 +3596,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: pooled_output = self.activation(pooled_output) return pooled_output -class GroundingDINOTextModel(PreTrainedModel): + +class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/transformers/models/grounding_dino/tokenization_grounding_dino.py b/src/transformers/models/grounding_dino/tokenization_grounding_dino.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index f0bc1e774383b5..a36b75ce60a657 100644 --- 
a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2340,30 +2340,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None - - -class GroundingDINOForObjectDetection(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class GroundingDINOModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class GroundingDINOPreTrainedModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -3852,6 +3828,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GroundingDINOForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/utils/check_repo.py b/utils/check_repo.py index 98f2436ae3af45..8600226c8205eb 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -220,6 +220,7 @@ "FlavaMultimodalModel", "GPT2DoubleHeadsModel", "GPTSw3DoubleHeadsModel", + "GroundingDINOTextPrenet", "InstructBlipVisionModel", "InstructBlipQFormerModel", "LayoutLMForQuestionAnswering", From 1f6475f7c002ec44f6abc74d0abd08b150ecbf71 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 13:45:13 -0300 Subject: [PATCH 031/252] Now text_backbone_config has its own class --- .../configuration_grounding_dino.py | 119 ++++++++++++++++-- 1 file changed, 111 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 09b9c41f131964..a3aa2b733d0474 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -25,6 +25,115 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } +# Copied from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet +class GroundingDINOTextPrenetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a + [`TFGroundingDINOTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. 
Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`] or [`TFGroundingDINOTextPrenetModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`] or + [`TFGroundingDINOTextPrenetModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python + >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOTextPrenetModel + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = GroundingDINOTextPrenetConfig() + + >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration + >>> model = GroundingDINOTextPrenetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "grounding-dino-text-prenet" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + class GroundingDINOConfig(PretrainedConfig): r""" @@ -177,7 +286,7 @@ def __init__( self, use_timm_backbone=False, backbone_config={"model_type": "swin"}, - text_backbone_config="bert-base-uncased", + text_backbone_config=None, num_channels=3, num_queries=900, max_position_embeddings=1024, @@ -187,15 +296,12 @@ def __init__( decoder_layers=6, decoder_ffn_dim=2048, decoder_attention_heads=8, - encoder_layerdrop=0.0, is_encoder_decoder=True, activation_function="relu", d_model=256, dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", @@ -259,9 +365,6 @@ def __init__( self.attention_dropout = attention_dropout self.activation_dropout = activation_dropout self.activation_function = activation_function - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.encoder_layerdrop = encoder_layerdrop self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type self.backbone = backbone @@ -289,7 +392,7 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) + self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config self.max_text_len = max_text_len self.sub_sentence_present = sub_sentence_present # Text Enhancer From d763e0413bc8885f9421aaf8aab1f873079a876e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 13:47:56 -0300 Subject: [PATCH 032/252] Modified convert script --- .../convert_grounding_dino_to_hf.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git 
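Aside (illustrative, not part of the patch): with this commit the text backbone is described by a nested `GroundingDINOTextPrenetConfig` (BERT-base-sized defaults) instead of a checkpoint name, and `GroundingDINOConfig` builds that default itself when `text_backbone_config=None`. A hedged usage sketch, assuming this branch is installed and the classes are importable from the module path below:

```python
# Sketch only; import path and the example hyperparameters are assumptions.
from transformers.models.grounding_dino.configuration_grounding_dino import (
    GroundingDINOConfig,
    GroundingDINOTextPrenetConfig,
)

# Describe a smaller text tower explicitly instead of relying on the BERT-base defaults.
text_config = GroundingDINOTextPrenetConfig(num_hidden_layers=6, num_attention_heads=8, hidden_size=512)
config = GroundingDINOConfig(text_backbone_config=text_config)
print(config.text_backbone_config.num_hidden_layers)  # 6
```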
a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4f2f3716329ed4..29ad93f70ab536 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -374,7 +374,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return attention_mask, position_ids.to(torch.long) - tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") @@ -401,12 +401,21 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token @torch.no_grad() -def convert_grounding_dino_checkpoint( - model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str = None, push_to_hub: bool = False -): +def convert_grounding_dino_checkpoint(args): + + model_name = args.model_name + pytorch_dump_folder_path = args.pytorch_dump_folder_path + push_to_hub = args.push_to_hub + + checkpoint_mapping = { + "grounding-dino-tiny": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", + "grounding-dino-base": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + } # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) + checkpoint_path = checkpoint_mapping[model_name] + # Load original checkpoint original_state_dict = torch.load(checkpoint_path, map_location="cpu") @@ -432,7 +441,7 @@ def convert_grounding_dino_checkpoint( text_inputs, text_token_mask = text_processor(text, config) # Running forward - model( + output = model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"], @@ -451,8 +460,11 @@ def convert_grounding_dino_checkpoint( if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") - model.push_to_hub(f"microsoft/{model_name}") - image_processor.push_to_hub(f"microsoft/{model_name}") + model.push_to_hub(f"EduardoPacheco/{model_name}") + #TODO push image processor to hub + # image_processor.push_to_hub(f"microsoft/{model_name}") + #TODO push tokenizer to hub + #TODO push processor to hub if __name__ == "__main__": @@ -460,17 +472,17 @@ def convert_grounding_dino_checkpoint( # Required parameters parser.add_argument( "--model_name", - default="grounding-dino-tiny", + default="grounding-dino-base", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", ) - parser.add_argument( - "--checkpoint_path", - default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", - type=str, - help="Path to the original PyTorch checkpoint (.pth file).", - ) + # parser.add_argument( + # "--checkpoint_path", + # default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + # type=str, + # help="Path to the original PyTorch checkpoint (.pth file).", + # ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
) @@ -479,6 +491,4 @@ def convert_grounding_dino_checkpoint( ) args = parser.parse_args() - convert_grounding_dino_checkpoint( - args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub - ) + convert_grounding_dino_checkpoint(args) From 04022d4aa398501f692437a1cbb1a4a48dc2bcab Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 15:01:44 -0300 Subject: [PATCH 033/252] Removed unnecessary config attribute --- .../configuration_grounding_dino.py | 2 -- .../convert_grounding_dino_to_hf.py | 21 ++++--------------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index a3aa2b733d0474..fbd0d483b48e45 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -326,7 +326,6 @@ def __init__( disable_custom_kernels=False, # other parameters max_text_len=256, - sub_sentence_present=True, text_enhancer_dropout=0.0, fusion_droppath=0.1, fusion_dropout=0.0, @@ -394,7 +393,6 @@ def __init__( # Text backbone self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config self.max_text_len = max_text_len - self.sub_sentence_present = sub_sentence_present # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout # Fusion diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 29ad93f70ab536..ed16da3f0c4617 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -347,7 +347,6 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token # generate attention mask and positional ids attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) position_ids = torch.zeros((bs, num_token), device=input_ids.device) - cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 for i in range(idxs.shape[0]): row, col = idxs[i] @@ -359,18 +358,8 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token position_ids[row, previous_col + 1 : col + 1] = torch.arange( 0, col - previous_col, device=input_ids.device ) - c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() - c2t_maski[previous_col + 1 : col] = True - cate_to_token_mask_list[row].append(c2t_maski) - previous_col = col - - cate_to_token_mask_list = [ - torch.stack(cate_to_token_mask_listi, dim=0) for cate_to_token_mask_listi in cate_to_token_mask_list - ] - # # padding mask - # padding_mask = tokenized['attention_mask'] - # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + previous_col = col return attention_mask, position_ids.to(torch.long) @@ -383,7 +372,6 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token ) max_text_len = config.max_text_len - sub_sentence_present = config.sub_sentence_present if text_self_attention_masks.shape[1] > max_text_len: text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] position_ids = position_ids[:, :max_text_len] @@ -392,10 +380,9 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token 
tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] # extract text embeddings - if sub_sentence_present: - tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} - tokenized_for_encoder["attention_mask"] = text_self_attention_masks - tokenized_for_encoder["position_ids"] = position_ids + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids return tokenized_for_encoder, tokenized.attention_mask.bool() From 938f805a92a8a4c73aebce5938db0067736cda4f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:06:12 -0300 Subject: [PATCH 034/252] Added new function to generate sub sentence mask --- .../grounding_dino/modeling_grounding_dino.py | 76 +++++++++++++++---- 1 file changed, 61 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 69264d51b5e6b0..d75db4735ad30a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -47,7 +47,7 @@ from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone -from .configuration_grounding_dino import GroundingDINOConfig +from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextPrenetConfig from .load_custom import load_cuda_kernels @@ -1923,9 +1923,16 @@ def custom_forward(*inputs): layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, + query_pos, + reference_points_input, + spatial_shapes, + level_start_index, vision_encoder_hidden_states, vision_encoder_attention_mask, - None, + text_encoder_hidden_states, + text_encoder_attention_mask, + self_attn_mask, + None ) else: layer_outputs = decoder_layer( @@ -2005,6 +2012,42 @@ def custom_forward(*inputs): text_cross_attentions=all_cross_attns_text, ) +SPECIAL_TOKENS = [101, 102, 1012, 1029] +def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: + """Generate attention mask between each pair of special tokens and positional ids. + Args: + input_ids (torch.LongTensor): input ids. Shape: [bs, num_token] + Returns: + Tuple[torch.Tensor]: attention mask between each special tokens and position_ids + """ + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in SPECIAL_TOKENS: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + + previous_col = col + + return attention_mask, position_ids.to(torch.long) + @add_start_docstrings( """ @@ -2173,11 +2216,8 @@ def forward( self, pixel_values: Tensor, input_ids: Tensor, - attention_mask: Tensor, token_type_ids: Tensor, - text_token_mask: Tensor, - text_self_attention_masks: Tensor, - position_ids: Tensor, + attention_mask: Tensor, pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, @@ -2214,8 +2254,19 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + + max_text_len = self.config.max_text_len + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + input_ids = input_ids[:, :max_text_len] + token_type_ids = token_type_ids[:, :max_text_len] + text_token_mask = text_token_mask[:, :max_text_len] + # Extract text features from text backbone - text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)[ + text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ "last_hidden_state" ] text_features = self.input_proj_text(text_features) @@ -2463,11 +2514,8 @@ def forward( self, pixel_values: torch.FloatTensor, input_ids: torch.LongTensor, - attention_mask: torch.BoolTensor, + attention_mask: torch.LongTensor, token_type_ids: torch.LongTensor, - text_token_mask: torch.BoolTensor, - text_self_attention_masks: torch.BoolTensor, - position_ids: torch.LongTensor, pixel_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, @@ -2523,9 +2571,6 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, - text_token_mask=text_token_mask, - text_self_attention_masks=text_self_attention_masks, - position_ids=position_ids, pixel_mask=pixel_mask, encoder_outputs=encoder_outputs, output_attentions=output_attentions, @@ -2551,7 +2596,7 @@ def forward( outputs_class = self.class_embed[level]( vision_hidden_state=hidden_states[:, level], text_hidden_state=enc_text_hidden_state, - text_token_mask=text_token_mask, + text_token_mask=attention_mask.bool(), ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: 
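Aside (illustrative, not part of the patch): `generate_masks_with_special_tokens_and_transfer_map` above turns a flat prompt such as "a cat. a dog." into per-phrase attention blocks with restarted position ids, so each phrase only attends within itself. The self-contained sketch below mirrors that logic; the helper name and the bert-base-uncased token ids used for the example are assumptions for illustration only.

```python
# Standalone sketch of the sub-sentence masking; mirrors the patch logic, not imported from it.
import torch

SPECIAL_TOKENS = [101, 102, 1012, 1029]  # [CLS], [SEP], ".", "?"


def build_sub_sentence_masks(input_ids: torch.LongTensor):
    batch_size, num_token = input_ids.shape
    special = torch.zeros((batch_size, num_token), dtype=torch.bool)
    for token_id in SPECIAL_TOKENS:
        special |= input_ids == token_id

    # Start from the identity so every token can at least attend to itself.
    attention_mask = torch.eye(num_token, dtype=torch.bool).unsqueeze(0).repeat(batch_size, 1, 1)
    position_ids = torch.zeros((batch_size, num_token), dtype=torch.long)
    previous_col = 0
    for row, col in torch.nonzero(special):
        if col == 0 or col == num_token - 1:
            attention_mask[row, col, col] = True
            position_ids[row, col] = 0
        else:
            # Tokens between two special tokens form one block and restart their positions at 0.
            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
            position_ids[row, previous_col + 1 : col + 1] = torch.arange(0, col - previous_col)
        previous_col = col
    return attention_mask, position_ids


ids = torch.tensor([[101, 1037, 4937, 1012, 1037, 3899, 1012, 102]])  # assumed ids for "a cat. a dog."
mask, positions = build_sub_sentence_masks(ids)
print(positions)  # tensor([[0, 0, 1, 2, 0, 1, 2, 0]]) -> positions restart for each phrase
print(mask[0, 1, 2].item(), mask[0, 1, 4].item())  # True False -> "cat" tokens never see "dog" tokens
```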
@@ -3609,6 +3654,7 @@ class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ + config_class = GroundingDINOTextPrenetConfig def __init__(self, config, add_pooling_layer=True): super().__init__(config) From 6f08b04abbf1c81d7f6f6da650b79e8bc0d70e31 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:26:14 -0300 Subject: [PATCH 035/252] Renamed parameters with gamma in the name as it's currently not allowed --- .../models/grounding_dino/modeling_grounding_dino.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d75db4735ad30a..71e7cb33fba0b9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1107,8 +1107,8 @@ def __init__(self, config, init_values=1e-4): # add layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.gamma_v = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) - self.gamma_l = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.vision_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.text_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) @@ -1119,8 +1119,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at vision_attention_mask=attention_mask_vision, text_attention_mask=attention_mask_text, ) - vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) - text_features = text_features + self.drop_path(self.gamma_l * delta_t) + vision_features = vision_features + self.drop_path(self.vision_param * delta_v) + text_features = text_features + self.drop_path(self.text_param * delta_t) return (vision_features, vision_attn), (text_features, text_attn) From 7666253ac7b1ceba183ab2c55a31ef6713ca13d4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:27:04 -0300 Subject: [PATCH 036/252] Removed tokenization and image_processing scripts since we'll map from existing models --- .../image_processing_grounding_dino.py | 967 ------------------ .../tokenization_grounding_dino.py | 0 2 files changed, 967 deletions(-) delete mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py delete mode 100644 src/transformers/models/grounding_dino/tokenization_grounding_dino.py diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py deleted file mode 100644 index 1adf8e8e0dcd62..00000000000000 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ /dev/null @@ -1,967 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Image processor class for Deformable DETR.""" - -import io -import pathlib -from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union - -import numpy as np - -from ...feature_extraction_utils import BatchFeature -from ...image_processing_utils import BaseImageProcessor, get_size_dict -from ...image_transforms import ( - PaddingMode, - center_to_corners_format, - corners_to_center_format, - id_to_rgb, - pad, - rescale, - resize, - rgb_to_id, - to_channel_dimension_format, -) -from ...image_utils import ( - IMAGENET_DEFAULT_MEAN, - IMAGENET_DEFAULT_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_coco_detection_annotations, - valid_images, -) -from ...utils import ( - ExplicitEnum, - TensorType, - is_flax_available, - is_jax_tensor, - is_scipy_available, - is_tf_available, - is_tf_tensor, - is_torch_available, - is_torch_tensor, - is_vision_available, - logging, -) - - -if is_torch_available(): - import torch - from torch import nn - - -if is_vision_available(): - import PIL - -if is_scipy_available(): - import scipy.special - import scipy.stats - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotionFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - - -SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION) - - -# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio -def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: - """ - Computes the output image size given the input image size and the desired output size. - - Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): - The desired output size. - max_size (`int`, *optional*): - The maximum allowed output size. - """ - height, width = image_size - if max_size is not None: - min_original_size = float(min((height, width))) - max_original_size = float(max((height, width))) - if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) - - if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: - ow = size - oh = int(size * height / width) - else: - oh = size - ow = int(size * width / height) - return (oh, ow) - - -# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size -def get_resize_output_image_size( - input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], - max_size: Optional[int] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, -) -> Tuple[int, int]: - """ - Computes the output image size given the input image size and the desired output size. If the desired output size - is a tuple or list, the output image size is returned as is. 
If the desired output size is an integer, the output - image size is computed by keeping the aspect ratio of the input image size. - - Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): - The desired output size. - max_size (`int`, *optional*): - The maximum allowed output size. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred from the input image. - """ - image_size = get_image_size(input_image, input_data_format) - if isinstance(size, (list, tuple)): - return size - - return get_size_with_aspect_ratio(image_size, size, max_size) - - -# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn -def get_numpy_to_framework_fn(arr) -> Callable: - """ - Returns a function that converts a numpy array to the framework of the input array. - - Args: - arr (`np.ndarray`): The array to convert. - """ - if isinstance(arr, np.ndarray): - return np.array - if is_tf_available() and is_tf_tensor(arr): - import tensorflow as tf - - return tf.convert_to_tensor - if is_torch_available() and is_torch_tensor(arr): - import torch - - return torch.tensor - if is_flax_available() and is_jax_tensor(arr): - import jax.numpy as jnp - - return jnp.array - raise ValueError(f"Cannot convert arrays of type {type(arr)}") - - -# Copied from transformers.models.detr.image_processing_detr.safe_squeeze -def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: - """ - Squeezes an array, but only if the axis specified has dim 1. - """ - if axis is None: - return arr.squeeze() - - try: - return arr.squeeze(axis=axis) - except ValueError: - return arr - - -# Copied from transformers.models.detr.image_processing_detr.normalize_annotation -def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: - image_height, image_width = image_size - norm_annotation = {} - for key, value in annotation.items(): - if key == "boxes": - boxes = value - boxes = corners_to_center_format(boxes) - boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) - norm_annotation[key] = boxes - else: - norm_annotation[key] = value - return norm_annotation - - -# Copied from transformers.models.detr.image_processing_detr.max_across_indices -def max_across_indices(values: Iterable[Any]) -> List[Any]: - """ - Return the maximum value across all indices of an iterable of values. - """ - return [max(values_i) for values_i in zip(*values)] - - -# Copied from transformers.models.detr.image_processing_detr.get_max_height_width -def get_max_height_width( - images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None -) -> List[int]: - """ - Get the maximum height and width across all images in a batch. 
- """ - if input_data_format is None: - input_data_format = infer_channel_dimension_format(images[0]) - - if input_data_format == ChannelDimension.FIRST: - _, max_height, max_width = max_across_indices([img.shape for img in images]) - elif input_data_format == ChannelDimension.LAST: - max_height, max_width, _ = max_across_indices([img.shape for img in images]) - else: - raise ValueError(f"Invalid channel dimension format: {input_data_format}") - return (max_height, max_width) - - -# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask -def make_pixel_mask( - image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None -) -> np.ndarray: - """ - Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. - - Args: - image (`np.ndarray`): - Image to make the pixel mask for. - output_size (`Tuple[int, int]`): - Output size of the mask. - """ - input_height, input_width = get_image_size(image, channel_dim=input_data_format) - mask = np.zeros(output_size, dtype=np.int64) - mask[:input_height, :input_width] = 1 - return mask - -def prepare_coco_detection_annotation( - image, - target, - input_data_format: Optional[Union[ChannelDimension, str]] = None, -): - """ - Convert the target in COCO format into the format expected by GroundingDINO. - """ - image_height, image_width = get_image_size(image, channel_dim=input_data_format) - - image_id = target["image_id"] - image_id = np.asarray([image_id], dtype=np.int64) - - # Get all COCO annotations for the given image. - annotations = target["annotations"] - annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] - - classes = [obj["category_id"] for obj in annotations] - classes = np.asarray(classes, dtype=np.int64) - - # for conversion to coco api - area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) - iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) - - boxes = [obj["bbox"] for obj in annotations] - # guard against no boxes via resizing - boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) - boxes[:, 2:] += boxes[:, :2] - boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) - boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) - - keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) - - new_target = {} - new_target["image_id"] = image_id - new_target["class_labels"] = classes[keep] - new_target["boxes"] = boxes[keep] - new_target["area"] = area[keep] - new_target["iscrowd"] = iscrowd[keep] - new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) - - if annotations and "keypoints" in annotations[0]: - keypoints = [obj["keypoints"] for obj in annotations] - keypoints = np.asarray(keypoints, dtype=np.float32) - num_keypoints = keypoints.shape[0] - keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints - new_target["keypoints"] = keypoints[keep] - - return new_target - -# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities -def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - probs = scipy.special.softmax(logits, axis=-1) - labels = probs.argmax(-1, keepdims=True) - scores = np.take_along_axis(probs, labels, axis=-1) - scores, labels = scores.squeeze(-1), labels.squeeze(-1) - return scores, labels - -# Copied from 
transformers.models.detr.image_processing_detr.resize_annotation -def resize_annotation( - annotation: Dict[str, Any], - orig_size: Tuple[int, int], - target_size: Tuple[int, int], - threshold: float = 0.5, - resample: PILImageResampling = PILImageResampling.NEAREST, -): - """ - Resizes an annotation to a target size. - - Args: - annotation (`Dict[str, Any]`): - The annotation dictionary. - orig_size (`Tuple[int, int]`): - The original size of the input image. - target_size (`Tuple[int, int]`): - The target size of the image, as returned by the preprocessing `resize` step. - threshold (`float`, *optional*, defaults to 0.5): - The threshold used to binarize the segmentation masks. - resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): - The resampling filter to use when resizing the masks. - """ - ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) - ratio_height, ratio_width = ratios - - new_annotation = {} - new_annotation["size"] = target_size - - for key, value in annotation.items(): - if key == "boxes": - boxes = value - scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) - new_annotation["boxes"] = scaled_boxes - elif key == "area": - area = value - scaled_area = area * (ratio_width * ratio_height) - new_annotation["area"] = scaled_area - elif key == "masks": - masks = value[:, None] - masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) - masks = masks.astype(np.float32) - masks = masks[:, 0] > threshold - new_annotation["masks"] = masks - elif key == "size": - new_annotation["size"] = target_size - else: - new_annotation[key] = value - - return new_annotation - - -class GroundingDINOImageProcessor(BaseImageProcessor): - r""" - Constructs a Grounding DINO image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be - overridden by the `do_resize` parameter in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Resampling filter to use if resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the - `do_rescale` parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - do_normalize: - Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the - `preprocess` method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): - Mean values to use when normalizing the image. Can be a single value or a list of values, one for each - channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): - Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one - for each channel. 
Can be overridden by the `image_std` parameter in the `preprocess` method. - do_pad (`bool`, *optional*, defaults to `True`): - Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be - overridden by the `do_pad` parameter in the `preprocess` method. - """ - - model_input_names = ["pixel_values", "pixel_mask"] - - def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BILINEAR, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Union[float, List[float]] = None, - image_std: Union[float, List[float]] = None, - do_pad: bool = True, - **kwargs, - ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} - size = get_size_dict(size, max_size=max_size, default_to_square=False) - - super().__init__(**kwargs) - self.format = format - self.do_resize = do_resize - self.size = size - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad - - @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO - def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - - def prepare_annotation( - self, - image: np.ndarray, - target: Dict, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> Dict: - """ - Prepare an annotation for feeding into Grounding DINO model. - """ - target = prepare_coco_detection_annotation( - image, target, input_data_format=input_data_format - ) - - return target - - def prepare(self, image, target): - logger.warning_once( - "The `prepare` method is deprecated and will be removed in a v4.33. " - "Please use `prepare_annotation` instead. 
Note: the `prepare_annotation` method " - "does not return the image anymore.", - ) - target = self.prepare_annotation(image, target) - return image, target - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize - def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BILINEAR, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> np.ndarray: - """ - Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an - int, smaller edge of the image will be matched to this number. - - Args: - image (`np.ndarray`): - Image to resize. - size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Resampling filter to use if resizing the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) - if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( - image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format - ) - elif "height" in size and "width" in size: - size = (size["height"], size["width"]) - else: - raise ValueError( - "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" - f" {size.keys()}." - ) - image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs - ) - return image - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation - def resize_annotation( - self, - annotation, - orig_size, - size, - resample: PILImageResampling = PILImageResampling.NEAREST, - ) -> Dict: - """ - Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched - to this number. - """ - return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale - def rescale( - self, - image: np.ndarray, - rescale_factor: float, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """ - Rescale the image by the given factor. image = image * rescale_factor. - - Args: - image (`np.ndarray`): - Image to rescale. - rescale_factor (`float`): - The value to use for rescaling. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. 
Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the input image. If unset, is inferred from the input image. Can be - one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - """ - return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation - def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: - """ - Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to - `[center_x, center_y, width, height]` format. - """ - return normalize_annotation(annotation, image_size=image_size) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image - def _pad_image( - self, - image: np.ndarray, - output_size: Tuple[int, int], - constant_values: Union[float, Iterable[float]] = 0, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """ - Pad an image with zeros to the given size. - """ - input_height, input_width = get_image_size(image, channel_dim=input_data_format) - output_height, output_width = output_size - - pad_bottom = output_height - input_height - pad_right = output_width - input_width - padding = ((0, pad_bottom), (0, pad_right)) - padded_image = pad( - image, - padding, - mode=PaddingMode.CONSTANT, - constant_values=constant_values, - data_format=data_format, - input_data_format=input_data_format, - ) - return padded_image - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad - def pad( - self, - images: List[np.ndarray], - constant_values: Union[float, Iterable[float]] = 0, - return_pixel_mask: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> BatchFeature: - """ - Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width - in the batch and optionally returns their corresponding pixel mask. - - Args: - image (`np.ndarray`): - Image to pad. - constant_values (`float` or `Iterable[float]`, *optional*): - The value to use for the padding if `mode` is `"constant"`. - return_pixel_mask (`bool`, *optional*, defaults to `True`): - Whether to return a pixel mask. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. 
- input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - pad_size = get_max_height_width(images, input_data_format=input_data_format) - - padded_images = [ - self._pad_image( - image, - pad_size, - constant_values=constant_values, - data_format=data_format, - input_data_format=input_data_format, - ) - for image in images - ] - data = {"pixel_values": padded_images} - - if return_pixel_mask: - masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) - for image in images - ] - data["pixel_mask"] = masks - - return BatchFeature(data=data, tensor_type=return_tensors) - - def preprocess( - self, - images: ImageInput, - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - resample=None, # PILImageResampling - do_rescale: Optional[bool] = None, - rescale_factor: Optional[Union[int, float]] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_pad: Optional[bool] = None, - return_tensors: Optional[Union[TensorType, str]] = None, - data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> BatchFeature: - """ - Preprocess an image or a batch of images so that it can be used by the model. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. - annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - do_resize (`bool`, *optional*, defaults to self.do_resize): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. - resample (`PILImageResampling`, *optional*, defaults to self.resample): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to self.do_rescale): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to self.rescale_factor): - Rescale factor to use when rescaling the image. - do_normalize (`bool`, *optional*, defaults to self.do_normalize): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): - Mean to use when normalizing the image. - image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): - Standard deviation to use when normalizing the image. - do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. 
- return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): - Type of tensors to return. If `None`, will return the list of images. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - if "pad_and_return_pixel_mask" in kwargs: - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - max_size = None - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - size = kwargs.pop("max_size") - - do_resize = self.do_resize if do_resize is None else do_resize - size = self.size if size is None else size - size = get_size_dict(size=size, max_size=max_size, default_to_square=False) - resample = self.resample if resample is None else resample - do_rescale = self.do_rescale if do_rescale is None else do_rescale - rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor - do_normalize = self.do_normalize if do_normalize is None else do_normalize - image_mean = self.image_mean if image_mean is None else image_mean - image_std = self.image_std if image_std is None else image_std - do_pad = self.do_pad if do_pad is None else do_pad - - if do_resize is not None and size is None: - raise ValueError("Size and max_size must be specified if do_resize is True.") - - if do_rescale is not None and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") - - if do_normalize is not None and (image_mean is None or image_std is None): - raise ValueError("Image mean and std must be specified if do_normalize is True.") - - images = make_list_of_images(images) - if annotations is not None and isinstance(annotations, dict): - annotations = [annotations] - - if annotations is not None and len(images) != len(annotations): - raise ValueError( - f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." - ) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - if not valid_coco_detection_annotations(annotations): - raise ValueError( - "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts" - "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " - "being a list of annotations in the COCO format." 
- ) - - # All transformations expect numpy arrays - images = [to_numpy_array(image) for image in images] - - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) - if annotations is not None: - prepared_images = [] - prepared_annotations = [] - for image, target in zip(images, annotations): - target = self.prepare_annotation( - image, - target, - input_data_format=input_data_format, - ) - prepared_images.append(image) - prepared_annotations.append(target) - images = prepared_images - annotations = prepared_annotations - del prepared_images, prepared_annotations - - # transformations - if do_resize: - if annotations is not None: - resized_images, resized_annotations = [], [] - for image, target in zip(images, annotations): - orig_size = get_image_size(image, input_data_format) - resized_image = self.resize( - image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format - ) - resized_annotation = self.resize_annotation( - target, orig_size, get_image_size(resized_image, input_data_format) - ) - resized_images.append(resized_image) - resized_annotations.append(resized_annotation) - images = resized_images - annotations = resized_annotations - del resized_images, resized_annotations - else: - images = [ - self.resize(image, size=size, resample=resample, input_data_format=input_data_format) - for image in images - ] - - if do_rescale: - images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] - - if do_normalize: - images = [ - self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images - ] - if annotations is not None: - annotations = [ - self.normalize_annotation(annotation, get_image_size(image, input_data_format)) - for annotation, image in zip(annotations, images) - ] - - if do_pad: - # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} - data = self.pad( - images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format - ) - else: - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - for image in images - ] - data = {"pixel_values": images} - - encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) - if annotations is not None: - encoded_inputs["labels"] = [ - BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations - ] - - return encoded_inputs - - # POSTPROCESSING METHODS - TODO: add support for other frameworks - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`GroundingDINOForObjectDetection`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. - Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - - def post_process_object_detection( - self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100 - ): - """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`GroundingDINOForObjectDetection`]): - Raw outputs of the model. - threshold (`float`, *optional*): - Score threshold to keep object detection predictions. - target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. - top_k (`int`, *optional*, defaults to 100): - Keep only top k bounding boxes before filtering by thresholding. - - Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if target_sizes is not None: - if len(out_logits) != len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits" - ) - - prob = out_logits.sigmoid() - prob = prob.view(out_logits.shape[0], -1) - k_value = min(top_k, prob.size(1)) - topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - if isinstance(target_sizes, List): - img_h = torch.Tensor([i[0] for i in target_sizes]) - img_w = torch.Tensor([i[1] for i in target_sizes]) - else: - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [] - for s, l, b in zip(scores, labels, boxes): - score = s[s > threshold] - label = l[s > threshold] - box = b[s > threshold] - results.append({"scores": score, "labels": label, "boxes": box}) - - return results diff --git a/src/transformers/models/grounding_dino/tokenization_grounding_dino.py b/src/transformers/models/grounding_dino/tokenization_grounding_dino.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 From 046e0c5ed5dd4e6edd5a29b56976e1ca318c5385 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:28:13 -0300 Subject: [PATCH 037/252] Fixed some issues with configuration --- .../configuration_grounding_dino.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index fbd0d483b48e45..e900714852fbaa 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Grounding DINO model configuration""" +import os +from typing import Union from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -25,7 +27,7 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } -# Copied from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet +# Modified from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet class GroundingDINOTextPrenetConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a @@ -134,6 +136,24 @@ def __init__( self.use_cache = use_cache self.classifier_dropout = classifier_dropout + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPSegConfig + if config_dict.get("model_type") == "grounding-dino": + config_dict = config_dict["text_backbone_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + class GroundingDINOConfig(PretrainedConfig): r""" @@ -289,7 +309,6 @@ def __init__( text_backbone_config=None, num_channels=3, num_queries=900, - max_position_embeddings=1024, encoder_layers=6, encoder_ffn_dim=2048, encoder_attention_heads=8, @@ -352,7 +371,6 @@ def __init__( self.backbone_config = backbone_config self.num_channels = num_channels self.num_queries = num_queries - self.max_position_embeddings = max_position_embeddings self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim self.encoder_layers = encoder_layers @@ -391,7 +409,7 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config + self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else GroundingDINOTextPrenetConfig(**text_backbone_config) self.max_text_len = max_text_len # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout From 70b248dfd515ad27f6d81758ddaa9992096fed98 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:32:22 -0300 Subject: [PATCH 038/252] Just some modifications on conversion script --- .../convert_grounding_dino_to_hf.py | 89 ++++--------------- 1 file changed, 18 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index ed16da3f0c4617..680c3872bf68dc 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -159,8 +159,8 @@ def create_rename_keys(state_dict, config): 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', } fusion_key_mappings = { - 'gamma_v': 'fusion_layer.gamma_v', - 'gamma_l': 'fusion_layer.gamma_l', + 'gamma_v': 
'fusion_layer.vision_param', + 'gamma_l': 'fusion_layer.text_param', 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', @@ -326,66 +326,11 @@ def preprocess_caption(caption: str) -> str: return result return result + "." - def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: - """Generate attention mask between each pair of special tokens - Args: - input_ids (torch.Tensor): input ids. Shape: [bs, num_token] - special_tokens_mask (list): special tokens mask. - Returns: - torch.Tensor: attention mask between each special tokens. - """ - input_ids = tokenized["input_ids"] - bs, num_token = input_ids.shape - # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens - special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() - for special_token in special_tokens_list: - special_tokens_mask |= input_ids == special_token - - # idxs: each row is a list of indices of special tokens - idxs = torch.nonzero(special_tokens_mask) - - # generate attention mask and positional ids - attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - position_ids = torch.zeros((bs, num_token), device=input_ids.device) - previous_col = 0 - for i in range(idxs.shape[0]): - row, col = idxs[i] - if (col == 0) or (col == num_token - 1): - attention_mask[row, col, col] = True - position_ids[row, col] = 0 - else: - attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True - position_ids[row, previous_col + 1 : col + 1] = torch.arange( - 0, col - previous_col, device=input_ids.device - ) - - previous_col = col - - return attention_mask, position_ids.to(torch.long) - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer - special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") - text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens - ) - - max_text_len = config.max_text_len - if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] - position_ids = position_ids[:, :max_text_len] - tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] - - # extract text embeddings - tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} - tokenized_for_encoder["attention_mask"] = text_self_attention_masks - tokenized_for_encoder["position_ids"] = position_ids - - return tokenized_for_encoder, tokenized.attention_mask.bool() + return tokenized @torch.no_grad() def convert_grounding_dino_checkpoint(args): @@ -415,7 +360,8 @@ def convert_grounding_dino_checkpoint(args): read_in_q_k_v(new_state_dict, config) # Load HF implementation with default config and converted state dict - model = GroundingDINOForObjectDetection(config).eval() + model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").eval() + # model = GroundingDINOForObjectDetection(config=config).eval() 
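# Illustrative sketch of what the simplified text pipeline above produces; the
# caption is arbitrary and `bert-base-uncased` is only the stand-in tokenizer
# that `text_processor` itself uses for now.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
caption = "a cat. a remote control"
caption = caption.lower().strip()
caption = caption if caption.endswith(".") else caption + "."  # same normalization as preprocess_caption
text_inputs = tokenizer([caption], padding="longest", return_tensors="pt")
# `text_inputs` holds input_ids, token_type_ids and attention_mask, which are the
# tensors unpacked into the model call below via **text_inputs.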
model.load_state_dict(new_state_dict, strict=False) # Load and process test image @@ -425,19 +371,24 @@ def convert_grounding_dino_checkpoint(args): [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] ) image_inputs = image_processor(image) - text_inputs, text_token_mask = text_processor(text, config) + text_inputs = text_processor(text, config) # Running forward output = model( pixel_values=image_inputs.unsqueeze(0), - input_ids=text_inputs["input_ids"], - attention_mask=text_inputs["attention_mask"], - token_type_ids=text_inputs["token_type_ids"], - text_token_mask=text_token_mask, - text_self_attention_masks=text_inputs["attention_mask"], - position_ids=text_inputs["position_ids"], + **text_inputs ) + # output.pred_boxes[:, :3, :] + # tensor([[[0.7674, 0.4136, 0.4572, 0.7305], + # [0.2566, 0.5463, 0.4760, 0.8777], + # [0.2585, 0.5442, 0.4640, 0.8683]]]) + # + # output.logits[:, :3, :4] + # tensor([[[-4.8913, -0.1900, -0.2161, -4.2374], + # [-4.9652, -0.3719, -0.3950, -4.2315], + # [-5.9599, -3.3765, -3.3104, -5.9752]]]) + if pytorch_dump_folder_path is not None: print(f"Saving model {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) @@ -448,10 +399,6 @@ def convert_grounding_dino_checkpoint(args): if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") model.push_to_hub(f"EduardoPacheco/{model_name}") - #TODO push image processor to hub - # image_processor.push_to_hub(f"microsoft/{model_name}") - #TODO push tokenizer to hub - #TODO push processor to hub if __name__ == "__main__": @@ -459,7 +406,7 @@ def convert_grounding_dino_checkpoint(args): # Required parameters parser.add_argument( "--model_name", - default="grounding-dino-base", + default="grounding-dino-tiny", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", From 3bc92b7688d531b8bd7e2ddf9708b08d6144fee6 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:33:52 -0300 Subject: [PATCH 039/252] Other modifications --- src/transformers/__init__.py | 4 ++-- src/transformers/models/grounding_dino/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4ea2c3ace121ea..1775754773a314 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -357,7 +357,7 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -4413,7 +4413,7 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index e3767e017d1023..df2b0d907f1b65 100644 --- 
a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -18,7 +18,7 @@ _import_structure = { - "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], + "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], } try: @@ -36,7 +36,7 @@ if TYPE_CHECKING: - from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig try: if not is_torch_available(): From 4cae0ca71fa0564e86b1b448359ca2bc5a5e924c Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 22 Aug 2023 18:50:32 -0300 Subject: [PATCH 040/252] Copied deformable detr --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/grounding-dino.md | 48 + src/transformers/__init__.py | 16 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/grounding_dino/__init__.py | 57 + .../configuration_grounding_dino.py | 262 ++ .../convert_grounding_dino_to_pytorch.py | 237 ++ .../models/grounding_dino/load_custom.py | 49 + .../grounding_dino/modeling_grounding_dino.py | 2513 +++++++++++++++++ tests/models/grounding_dino/__init__.py | 0 .../test_modeling_grounding_dino.py | 673 +++++ 15 files changed, 3865 insertions(+) create mode 100644 docs/source/en/model_doc/grounding-dino.md create mode 100644 src/transformers/models/grounding_dino/__init__.py create mode 100644 src/transformers/models/grounding_dino/configuration_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py create mode 100644 src/transformers/models/grounding_dino/load_custom.py create mode 100644 src/transformers/models/grounding_dino/modeling_grounding_dino.py create mode 100644 tests/models/grounding_dino/__init__.py create mode 100644 tests/models/grounding_dino/test_modeling_grounding_dino.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 7fc6ebf7d851b1..b80f2f093699a5 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -523,6 +523,8 @@ title: FocalNet - local: model_doc/glpn title: GLPN + - local: model_doc/grounding-dino + title: Grounding DINO - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md new file mode 100644 index 00000000000000..161a90609174b3 --- /dev/null +++ b/docs/source/en/model_doc/grounding-dino.md @@ -0,0 +1,48 @@ + + +# Grounding DINO + +## Overview + +The Grounding DINO model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
+ + +## GroundingDINOConfig + +[[autodoc]] GroundingDINOConfig + +## GroundingDINOModel + +[[autodoc]] GroundingDINOModel + - forward + +## GroundingDINOForObjectDetection + +[[autodoc]] GroundingDINOForObjectDetection + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a68a492676eac5..ff461296c5e76e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -275,6 +275,7 @@ "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"], "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], "models.deprecated": [], "models.deprecated.bort": [], @@ -1591,6 +1592,14 @@ "DeformableDetrPreTrainedModel", ] ) + _import_structure["models.grounding_dino"].extend( + [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + ) _import_structure["models.deit"].extend( [ "DEIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4428,6 +4437,7 @@ DecisionTransformerConfig, ) from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.deprecated.mctct import ( MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5583,6 +5593,12 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) from .models.deit import ( DEIT_PRETRAINED_MODEL_ARCHIVE_LIST, DeiTForImageClassification, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index b4486039b989da..cf718e4453f79d 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -61,6 +61,7 @@ deberta_v2, decision_transformer, deformable_detr, + grounding_dino, deit, deprecated, deta, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 5690359643c8e8..ca005bbc79df90 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -73,6 +73,7 @@ ("deberta-v2", "DebertaV2Config"), ("decision_transformer", "DecisionTransformerConfig"), ("deformable_detr", "DeformableDetrConfig"), + ("grounding-dino", "GroundingDINOConfig"), ("deit", "DeiTConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), @@ -287,6 +288,7 @@ ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deta", "DETA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -492,6 +494,7 @@ ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), ("deformable_detr", "Deformable DETR"), + ("grounding-dino", "Grounding DINO"), ("deit", 
"DeiT"), ("deplot", "DePlot"), ("deta", "DETA"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index befca6a64b81b7..5bc4db87f7048b 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -50,6 +50,7 @@ ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("data2vec-vision", "BeitFeatureExtractor"), ("deformable_detr", "DeformableDetrFeatureExtractor"), + ("grounding-dino", "GroundingDINOFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("dinat", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 13bb3a6e5d8a8f..a791255829287d 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -53,6 +53,7 @@ ("cvt", "ConvNextImageProcessor"), ("data2vec-vision", "BeitImageProcessor"), ("deformable_detr", "DeformableDetrImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("deit", "DeiTImageProcessor"), ("deta", "DetaImageProcessor"), ("detr", "DetrImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index bbbaa58d6ec0e6..842af5c5272abc 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -71,6 +71,7 @@ ("deberta-v2", "DebertaV2Model"), ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), + ("grounding-dino", "GroundingDINOModel"), ("deit", "DeiTModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), @@ -629,6 +630,7 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), + ("grounding-dino", "GroundingDINOForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py new file mode 100644 index 00000000000000..e3767e017d1023 --- /dev/null +++ b/src/transformers/models/grounding_dino/__init__.py @@ -0,0 +1,57 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_grounding_dino"] = [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py new file mode 100644 index 00000000000000..0b3ae3d74d3475 --- /dev/null +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -0,0 +1,262 @@ +# coding=utf-8 +# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Grounding DINO model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + +GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "idea-research/grg-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", +} + + + +class GroundingDINOConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate + a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Grounding DINO + [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. 
Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`GroundingDINOModel`] can detect in a single image. In case `two_stage` is set to `True`, we use + `two_stage_num_proposals` instead. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional + backbone from the timm package. For a list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. 
+ bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + num_feature_levels (`int`, *optional*, defaults to 4): + The number of input feature levels. + encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `False`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. + two_stage_num_proposals (`int`, *optional*, defaults to 300): + The number of region proposals to be generated, in case `two_stage` is set to `True`. + with_box_refine (`bool`, *optional*, defaults to `False`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. 
+ + Examples: + + ```python + >>> from transformers import GroundingDINOConfig, GroundingDINOModel + + >>> # Initializing a Grounding DINO SenseTime/deformable-detr style configuration + >>> configuration = GroundingDINOConfig() + + >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> model = GroundingDINOModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "grounding-dino" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + use_timm_backbone=True, + backbone_config=None, + num_channels=3, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=1024, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + return_intermediate=True, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + use_pretrained_backbone=True, + dilation=False, + num_feature_levels=4, + encoder_n_points=4, + decoder_n_points=4, + two_stage=False, + two_stage_num_proposals=300, + with_box_refine=False, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + focal_alpha=0.25, + disable_custom_kernels=False, + **kwargs, + ): + if backbone_config is not None and use_timm_backbone: + raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") + + if not use_timm_backbone: + if backbone_config is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_channels = num_channels + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.dilation = dilation + # deformable attributes + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + self.two_stage_num_proposals = two_stage_num_proposals + self.with_box_refine = with_box_refine + if two_stage is True and with_box_refine is False: + raise ValueError("If two_stage is True, with_box_refine must be True.") + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py new file mode 100644 index 00000000000000..d3cef0366b2bca --- /dev/null +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py @@ -0,0 +1,237 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
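# Illustrative sketch of the constraint enforced in the configuration above:
# enabling `two_stage` requires `with_box_refine`. A sketch against the classes
# added in this patch series, not a released API.
from transformers import GroundingDINOConfig

config = GroundingDINOConfig(two_stage=True, with_box_refine=True, two_stage_num_proposals=300)

try:
    GroundingDINOConfig(two_stage=True, with_box_refine=False)
except ValueError as err:
    print(err)  # "If two_stage is True, with_box_refine must be True."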
+"""Convert Grounding DINO checkpoints.""" + + +import argparse +import json +from pathlib import Path + +import requests +import torch +from huggingface_hub import cached_download, hf_hub_url +from PIL import Image + +from transformers import GroundingDINOConfig, GroundingDINOForObjectDetection, DeformableDetrImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def rename_key(orig_key): + if "backbone.0.body" in orig_key: + orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") + if "transformer" in orig_key: + orig_key = orig_key.replace("transformer.", "") + if "norm1" in orig_key: + if "encoder" in orig_key: + orig_key = orig_key.replace("norm1", "self_attn_layer_norm") + else: + orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") + if "norm2" in orig_key: + if "encoder" in orig_key: + orig_key = orig_key.replace("norm2", "final_layer_norm") + else: + orig_key = orig_key.replace("norm2", "self_attn_layer_norm") + if "norm3" in orig_key: + orig_key = orig_key.replace("norm3", "final_layer_norm") + if "linear1" in orig_key: + orig_key = orig_key.replace("linear1", "fc1") + if "linear2" in orig_key: + orig_key = orig_key.replace("linear2", "fc2") + if "query_embed" in orig_key: + orig_key = orig_key.replace("query_embed", "query_position_embeddings") + if "cross_attn" in orig_key: + orig_key = orig_key.replace("cross_attn", "encoder_attn") + + return orig_key + + +def read_in_q_k_v(state_dict): + # transformer decoder self-attention layers + for i in range(6): + # read in weights + bias of input projection layer of self-attention + in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] + state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] + state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] + state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def convert_grounding_dino_checkpoint( + checkpoint_path, + single_scale, + dilation, + with_box_refine, + two_stage, + pytorch_dump_folder_path, + push_to_hub, +): + """ + Copy/paste/tweak model's weights to our Grounding DINO structure. 
+ """ + + # load default config + config = GroundingDINOConfig() + # set config attributes + if single_scale: + config.num_feature_levels = 1 + config.dilation = dilation + config.with_box_refine = with_box_refine + config.two_stage = two_stage + # set labels + config.num_labels = 91 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + # load image processor + image_processor = DeformableDetrImageProcessor(format="coco_detection") + + # prepare image + img = prepare_img() + encoding = image_processor(images=img, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + logger.info("Converting model...") + + # load original state dict + state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + # rename keys + for key in state_dict.copy().keys(): + val = state_dict.pop(key) + state_dict[rename_key(key)] = val + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + prefix = "model." + for key in state_dict.copy().keys(): + if not key.startswith("class_embed") and not key.startswith("bbox_embed"): + val = state_dict.pop(key) + state_dict[prefix + key] = val + # finally, create HuggingFace model and load state dict + model = GroundingDINOForObjectDetection(config) + model.load_state_dict(state_dict) + model.eval() + + device = "cuda" if torch.cuda.is_available() else "cpu" + model.to(device) + # verify our conversion + outputs = model(pixel_values.to(device)) + + expected_logits = torch.tensor( + [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] + ) + expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) + + if single_scale: + expected_logits = torch.tensor( + [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + ) + expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) + + if single_scale and dilation: + expected_logits = torch.tensor( + [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] + ) + expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) + + if with_box_refine: + expected_logits = torch.tensor( + [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] + ) + expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) + + if with_box_refine and two_stage: + expected_logits = torch.tensor( + [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] + ) + expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]]) + + print("Logits:", outputs.logits[0, :3, :3]) + + assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) + + print("Everything ok!") + + # Save model and image processor + logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") + 
Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + image_processor.save_pretrained(pytorch_dump_folder_path) + + # Push to hub + if push_to_hub: + model_name = "deformable-detr" + model_name += "-single-scale" if single_scale else "" + model_name += "-dc5" if dilation else "" + model_name += "-with-box-refine" if with_box_refine else "" + model_name += "-two-stage" if two_stage else "" + print("Pushing model to hub...") + model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", + type=str, + default="/home/niels/checkpoints/grounding_dino/r50_grounding_dino-checkpoint.pth", + help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", + ) + parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") + parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") + parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") + parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the folder to output PyTorch model.", + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + args = parser.parse_args() + convert_grounding_dino_checkpoint( + args.checkpoint_path, + args.single_scale, + args.dilation, + args.with_box_refine, + args.two_stage, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/src/transformers/models/grounding_dino/load_custom.py b/src/transformers/models/grounding_dino/load_custom.py new file mode 100644 index 00000000000000..97b8f09fb5f446 --- /dev/null +++ b/src/transformers/models/grounding_dino/load_custom.py @@ -0,0 +1,49 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
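# Illustrative sketch of driving the conversion script above from Python instead of
# the command line; the checkpoint and output paths are placeholders, and the
# keyword arguments mirror the parameters of convert_grounding_dino_checkpoint as
# defined above.
convert_grounding_dino_checkpoint(
    checkpoint_path="/path/to/r50_grounding_dino-checkpoint.pth",
    single_scale=False,
    dilation=False,
    with_box_refine=True,
    two_stage=True,
    pytorch_dump_folder_path="/path/to/output",
    push_to_hub=False,
)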
+""" Loading of Grounding DINO's CUDA kernels""" +import os +from pathlib import Path + + +def load_cuda_kernels(): + from torch.utils.cpp_extension import load + + root = Path(__file__).resolve().parent.parent.parent / "kernels" / "grounding_dino" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] + + load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) + + import MultiScaleDeformableAttention as MSDA + + return MSDA diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py new file mode 100644 index 00000000000000..ee80a562e4b851 --- /dev/null +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -0,0 +1,2513 @@ +# coding=utf-8 +# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Grounding DINO model.""" + + +import copy +import math +import warnings +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_torch_cuda_available, + is_vision_available, + replace_return_docstrings, + requires_backends, +) +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import is_ninja_available, logging +from ..auto import AutoBackbone +from .configuration_grounding_dino import GroundingDINOConfig +from .load_custom import load_cuda_kernels + + +logger = logging.get_logger(__name__) + +# Move this to not compile only when importing, this needs to happen later, like in __init__. 
+if is_torch_cuda_available() and is_ninja_available(): + logger.info("Loading custom CUDA kernels...") + try: + MultiScaleDeformableAttention = load_cuda_kernels() + except Exception as e: + logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}") + MultiScaleDeformableAttention = None +else: + MultiScaleDeformableAttention = None + +if is_vision_available(): + from transformers.image_transforms import center_to_corners_format + + +class MultiScaleDeformableAttentionFunction(Function): + @staticmethod + def forward( + context, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + context.im2col_step = im2col_step + output = MultiScaleDeformableAttention.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + context.im2col_step, + ) + context.save_for_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights + ) + return output + + @staticmethod + @once_differentiable + def backward(context, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = context.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + context.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_timm_available(): + from timm import create_model + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "GroundingDINOConfig" +_CHECKPOINT_FOR_DOC = "idea-research/grg-dino-tiny" + +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "idea-research/grg-dino-tiny", + # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino +] + + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->GroundingDINO +class GroundingDINODecoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDINODecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOModelOutput(ModelOutput): + """ + Base class for outputs of the Grounding DINO encoder-decoder model. + + Args: + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. 
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+            layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+            foreground and background).
+        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Logits of predicted bounding boxes coordinates in the first stage.
+    """
+
+    init_reference_points: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+    intermediate_hidden_states: torch.FloatTensor = None
+    intermediate_reference_points: torch.FloatTensor = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->GroundingDINO
+class GroundingDINOObjectDetectionOutput(ModelOutput):
+    """
+    Output type of [`GroundingDINOForObjectDetection`].
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+            scale-invariant IoU loss.
+        loss_dict (`Dict`, *optional*):
+            A dictionary containing the individual losses. Useful for logging.
+        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+            Classification logits (including no-object) for all queries.
+        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+            possible padding). You can use [`~GroundingDINOProcessor.post_process_object_detection`] to retrieve the
+            unnormalized bounding boxes.
+        auxiliary_outputs (`list[Dict]`, *optional*):
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+            `pred_boxes`) for each decoder layer.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+            plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+            average in the self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+            layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
+            4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
+            in the self-attention heads.
+        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+            Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Initial reference points sent through the Transformer decoder.
+        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+            foreground and background).
+        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Logits of predicted bounding boxes coordinates in the first stage.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[Dict] = None
+    logits: torch.FloatTensor = None
+    pred_boxes: torch.FloatTensor = None
+    auxiliary_outputs: Optional[List[Dict]] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+def _get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+def inverse_sigmoid(x, eps=1e-5):
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)
+    x2 = (1 - x).clamp(min=eps)
+    return torch.log(x1 / x2)
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDINO
+class GroundingDINOFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+ """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDINO +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDINOFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) + + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvEncoder with DeformableDetr->GroundingDINO +class GroundingDINOConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by GroundingDINOFrozenBatchNorm2d as defined above. 
+ + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = {} + if config.dilation: + kwargs["output_stride"] = 16 + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), + in_chans=config.num_channels, + **kwargs, + ) + else: + backbone = AutoBackbone.from_config(config.backbone_config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDINO +class GroundingDINOConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +# Copied from transformers.models.detr.modeling_detr._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): + """ + Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. + """ + batch_size, source_len = mask.size() + target_len = target_len if target_len is not None else source_len + + expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->GroundingDINO +class GroundingDINOSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
+ """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding +class GroundingDINOLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->GroundingDINO +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = GroundingDINOSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +def multi_scale_deformable_attention( + value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor +) -> Tensor: + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, 
num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Grounding DINO. + """ + + def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): + super().__init__() + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in GroundingDINOMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." 
+ ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + self._reset_parameters() + + def _reset_parameters(self): + nn.init.constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.n_heads, 1, 1, 2) + .repeat(1, self.n_levels, self.n_points, 1) + ) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(self.attention_weights.weight.data, 0.0) + nn.init.constant_(self.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(self.value_proj.weight.data) + nn.init.constant_(self.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(self.output_proj.weight.data) + nn.init.constant_(self.output_proj.bias.data, 0.0) + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif reference_points.shape[-1] == 4: + 
sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + if self.disable_custom_kernels: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + else: + try: + # custom kernel + output = MultiScaleDeformableAttentionFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + except Exception: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + output = self.output_proj(output) + + return output, attention_weights + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINOMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the Grounding DINO paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, 
key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO +class GroundingDINOEncoderLayer(nn.Module): + def __init__(self, config: GroundingDINOConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = GroundingDINOMultiscaleDeformableAttention( + config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. 
+ reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO +class GroundingDINODecoderLayer(nn.Module): + def __init__(self, config: GroundingDINOConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = GroundingDINOMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. 
+ position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead +class GroundingDINOClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->GroundingDINO +class GroundingDINOPreTrainedModel(PreTrainedModel): + config_class = 
GroundingDINOConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, GroundingDINOLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): + module._reset_parameters() + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GroundingDINODecoder): + module.gradient_checkpointing = value + + +GROUNDING_DINO_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GroundingDINOConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GROUNDING_DINO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`] + for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->GroundingDINO +class GroundingDINOEncoder(GroundingDINOPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`GroundingDINOEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: GroundingDINOConfig + """ + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDINOEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. Used in decoder. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO +class GroundingDINODecoder(GroundingDINOPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Grounding DINO: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: GroundingDINOConfig + """ + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Grounding DINO + self.bbox_embed = None + self.class_embed = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. 
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+ in `[0, 1]`:
+ - 1 for pixels that are real (i.e. **not masked**),
+ - 0 for pixels that are padding (i.e. **masked**).
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Position embeddings that are added to the queries and keys in each self-attention layer.
+ reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `as_two_stage` else `(batch_size, num_queries, 2)`, *optional*):
+ Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
+ spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of the feature maps.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+ Indexes for the start of each feature level. In range `[0, sequence_length]`.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+ Ratio of valid area in each feature level.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+ intermediate = ()
+ intermediate_reference_points = ()
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if reference_points.shape[-1] == 4:
+ reference_points_input = (
+ reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+ )
+ else:
+ if reference_points.shape[-1] != 2:
+ raise ValueError("Reference points' last dimension must be of size 2")
+ reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
+
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ None,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ position_embeddings=position_embeddings,
+ encoder_hidden_states=encoder_hidden_states,
+ reference_points=reference_points_input,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+
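+ # The block below implements the iterative bounding box refinement mentioned in the class docstring: when
+ # `self.bbox_embed` is set (box-refinement / two-stage variants), each decoder layer predicts a delta in
+ # inverse-sigmoid (logit) space, adds it to the current reference points, re-applies the sigmoid, and detaches
+ # the result so that the refined points feed the next layer without gradients flowing through the refinement path.
+ # For instance, a coordinate of 0.25 has inverse-sigmoid value ln(0.25 / 0.75) ≈ -1.10; adding a predicted delta
+ # of +0.5 and re-applying the sigmoid moves the coordinate to ≈ 0.35.
+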
hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + if reference_points.shape[-1] != 2: + raise ValueError( + f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}" + ) + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return GroundingDINODecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. 
+ """, + GROUNDING_DINO_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO +class GroundingDINOModel(GroundingDINOPreTrainedModel): + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = GroundingDINOConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + if not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + + self.encoder = GroundingDINOEncoder(config) + self.decoder = GroundingDINODecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_heigth = valid_height.float() / height + valid_ratio_width = valid_width.float() / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 
0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + pixel_mask=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, GroundingDINOModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = GroundingDINOModel.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = 
image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(features): + sources.append(self.input_proj[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj[level](features[-1][0]) + else: + source = self.input_proj[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if not self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + valid_ratios = valid_ratios.float() + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=source_flatten, + 
attention_mask=mask_flatten, + position_embeddings=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, prepare decoder inputs + batch_size, _, num_channels = encoder_outputs[0].shape + enc_outputs_class = None + enc_outputs_coord_logits = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes + ) + + # hack implementation for two-stage Grounding DINO + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.config.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + return GroundingDINOModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + 
cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) + + +@add_start_docstrings( + """ + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. + """, + GROUNDING_DINO_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO +class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + + def __init__(self, config: GroundingDINOConfig): + super().__init__(config) + + # Grounding DINO encoder-decoder model + self.model = GroundingDINOModel(config) + + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = GroundingDINOMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + for box_embed in self.bbox_embed: + nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + # Initialize weights and apply final processing + self.post_init() + + # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
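+ # Only the intermediate decoder layers contribute auxiliary outputs: the final layer's predictions are returned
+ # separately as the model's main `logits` and `pred_boxes`, hence the `[:-1]` slicing in the return below.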
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+
+ @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values,
+ pixel_mask=None,
+ decoder_attention_mask=None,
+ encoder_outputs=None,
+ inputs_embeds=None,
+ decoder_inputs_embeds=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, GroundingDINOForObjectDetection
+ >>> from PIL import Image
+ >>> import requests
+ >>> import torch
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
+ >>> model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> # convert outputs (bounding boxes and class logits) to COCO API
+ >>> target_sizes = torch.tensor([image.size[::-1]])
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
+ ... 0
+ ... ]
+ >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+ ... box = [round(i, 2) for i in box.tolist()]
+ ... print(
+ ... f"Detected {model.config.id2label[label.item()]} with confidence "
+ ... f"{round(score.item(), 3)} at location {box}"
+ ...
) + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = GroundingDINOHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = GroundingDINOLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + if self.config.two_stage: + enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid() + outputs_loss["enc_outputs"] = {"logits": outputs.enc_outputs_class, "pred_boxes": enc_outputs_coord} + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] 
for k in loss_dict.keys() if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = GroundingDINOObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) + + return dict_outputs + + +# Copied from transformers.models.detr.modeling_detr.dice_loss +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDINO +class GroundingDINOLoss(nn.Module): + """ + This class computes the losses for `GroundingDINOForObjectDetection`. 
The process happens in two steps: 1) we + compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of + matched ground-truth / prediction (supervise class and box). + + Args: + matcher (`GroundingDINOHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parameter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros( + [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + dtype=source_logits.dtype, + layout=source_logits.layout, + device=source_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * source_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. 
+ """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx + + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
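+ # Each set of auxiliary outputs is re-matched against the targets and its losses are stored with an `_{i}`
+ # suffix; in the two-stage variant, the encoder proposals are additionally matched against binary
+ # (foreground-only) copies of the targets and their losses are stored with an `_enc` suffix.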
+ if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + if "enc_outputs" in outputs: + enc_outputs = outputs["enc_outputs"] + bin_targets = copy.deepcopy(targets) + for bt in bin_targets: + bt["class_labels"] = torch.zeros_like(bt["class_labels"]) + indices = self.matcher(enc_outputs, bin_targets) + for loss in self.losses: + l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) + l_dict = {k + "_enc": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead +class GroundingDINOMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDINO +class GroundingDINOHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
+ + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# Copied from transformers.models.detr.modeling_detr._upcast +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +# Copied from transformers.models.detr.modeling_detr.box_area +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# Copied from transformers.models.detr.modeling_detr.box_iou +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. 
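+ Formally, GIoU(a, b) = IoU(a, b) - area(C \ (a ∪ b)) / area(C), where C is the smallest axis-aligned box
+ enclosing both a and b; the implementation below computes this as `iou - (area - union) / area`.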
+ + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) + + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] + + return iou - (area - union) / area + + +# Copied from transformers.models.detr.modeling_detr._max_by_axis +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +# Copied from transformers.models.detr.modeling_detr.NestedTensor +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + batch_size, num_channels, height, width = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) diff --git a/tests/models/grounding_dino/__init__.py b/tests/models/grounding_dino/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py new file mode 100644 index 00000000000000..3007eef6399916 --- /dev/null +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -0,0 +1,673 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Testing suite for the PyTorch Grounding DINO model. """ + + +import inspect +import math +import unittest +from typing import Dict, List, Tuple + +from transformers import GroundingDINOConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers.file_utils import cached_property +from transformers.testing_utils import ( + require_timm, + require_torch, + require_torch_gpu, + require_vision, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import GroundingDINOForObjectDetection, GroundingDINOModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class GroundingDINOModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + image_size=196, + n_targets=8, + num_labels=91, + num_feature_levels=4, + encoder_n_points=2, + decoder_n_points=6, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.image_size = image_size + self.n_targets = n_targets + self.num_labels = num_labels + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = ( + math.ceil(self.image_size / 8) ** 2 + + math.ceil(self.image_size / 16) ** 2 + + math.ceil(self.image_size / 32) ** 2 + + math.ceil(self.image_size / 64) ** 2 + ) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + resnet_config = ResNetConfig( + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + hidden_act="relu", + num_labels=3, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + return 
GroundingDINOConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + num_feature_levels=self.num_feature_levels, + encoder_n_points=self.encoder_n_points, + decoder_n_points=self.decoder_n_points, + use_timm_backbone=False, + backbone_config=resnet_config, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, labels): + model = GroundingDINOModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) + + def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = GroundingDINOForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GroundingDINOModel, GroundingDINOForObjectDetection) if is_torch_available() else () + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "GroundingDINOForObjectDetection": + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.image_size, + self.model_tester.image_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = GroundingDINOModelTester(self) + self.config_tester = ConfigTester(self, config_class=GroundingDINOConfig, has_text_modality=False) + + def 
test_config(self): + # we don't test common_properties and arguments_init as these don't apply for Grounding DINO + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + + def test_grounding_dino_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_grounding_dino_model(*config_and_inputs) + + def test_grounding_dino_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_grounding_dino_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="Grounding DINO does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Grounding DINO does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="Grounding DINO is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="Grounding DINO does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + out_len = len(outputs) + + correct_outlen = 8 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "GroundingDINOForObjectDetection": + correct_outlen += 2 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + 
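+ # Deformable attention does not attend over the full key sequence: each query samples a fixed number of points
+ # per feature level, so the attention weights end in (num_heads, num_feature_levels, n_points) rather than
+ # (num_heads, query_length, key_length). That is what the assertions below (and the encoder check above) verify.
+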
self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.decoder_n_points, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." 
+ ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + # we take the second output since last_hidden_state is the second item + output = outputs[1] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if 
model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "GroundingDINOForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if param.requires_grad: + if ( + "level_embed" in name + or "sampling_offsets.bias" in name + or "value_proj" in name + or "output_proj" in name + or "reference_points" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_two_stage_training(self): + model_class = GroundingDINOForObjectDetection + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + config.two_stage = True + config.auxiliary_loss = True + config.with_box_refine = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class GroundingDINOModelIntegrationTests(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") if is_vision_available() else None + + def test_inference_object_detection_head(self): + model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-9.6645, -4.3449, -5.8705], [-9.7035, 
-3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] + ).to(torch_device) + expected_boxes = torch.tensor( + [[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) + + # verify postprocessing + results = image_processor.post_process_object_detection( + outputs, threshold=0.3, target_sizes=[image.size[::-1]] + )[0] + expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]).to(torch_device) + expected_labels = [17, 17, 75, 75, 63] + expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841]).to(torch_device) + + self.assertEqual(len(results["scores"]), 5) + self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4)) + self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) + + def test_inference_object_detection_head_with_box_refine_two_stage(self): + model = GroundingDINOForObjectDetection.from_pretrained( + "SenseTime/deformable-detr-with-box-refine-two-stage" + ).to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] + ).to(torch_device) + expected_boxes = torch.tensor( + [[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) + + @require_torch_gpu + def test_inference_object_detection_head_equivalence_cpu_gpu(self): + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt") + pixel_values = encoding["pixel_values"] + pixel_mask = encoding["pixel_mask"] + + # 1. run model on CPU + model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr-single-scale") + + with torch.no_grad(): + cpu_outputs = model(pixel_values, pixel_mask) + + # 2. run model on GPU + model.to("cuda") + + with torch.no_grad(): + gpu_outputs = model(pixel_values.to("cuda"), pixel_mask.to("cuda")) + + # 3. 
assert equivalence + for key in cpu_outputs.keys(): + assert torch.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-4) + + expected_logits = torch.tensor( + [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + ) + assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-4) From 149b462e673c6735c86198ed21c2470893a7d221 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 23 Aug 2023 12:25:43 -0300 Subject: [PATCH 041/252] First commit --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/tasks/object_detection.md | 2 +- .../configuration_grounding_dino.py | 6 +- .../convert_grounding_dino_to_hf.py | 242 ++++++++++++++++++ .../convert_grounding_dino_to_pytorch.py | 237 ----------------- .../grounding_dino/modeling_grounding_dino.py | 4 +- src/transformers/utils/dummy_pt_objects.py | 24 ++ 13 files changed, 279 insertions(+), 243 deletions(-) create mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py delete mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py diff --git a/README.md b/README.md index 853353ecc379cc..3311a4785b54d7 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_es.md b/README_es.md index e74485a2fcccdd..e5497cdd9cd8f6 100644 --- a/README_es.md +++ b/README_es.md @@ -350,6 +350,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 
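For readers following this patch series, the end-to-end usage the new classes are meant to support mirrors the integration tests added earlier in the series: preprocess an image, run `GroundingDINOForObjectDetection`, then post-process the raw logits and boxes. The sketch below is illustrative only: it assumes a converted checkpoint and a matching image processor are available under the `idea-research/grounding-dino-tiny` id referenced elsewhere in this patch, whereas the tests at this stage still point at Deformable DETR weights as placeholders.

```python
# Illustrative sketch only — not part of the patch; mirrors the integration-test pattern in this series.
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, GroundingDINOForObjectDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Assumed checkpoint id, taken from GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST in this patch;
# at this point in the PR no converted weights have been uploaded yet.
checkpoint = "idea-research/grounding-dino-tiny"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = GroundingDINOForObjectDetection.from_pretrained(checkpoint)

inputs = image_processor(images=image, return_tensors="pt")  # pixel_values + pixel_mask
with torch.no_grad():
    outputs = model(**inputs)

# Turn per-query logits/boxes into thresholded detections scaled back to the original image size
results = image_processor.post_process_object_detection(
    outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
print(results["scores"], results["labels"], results["boxes"])
```

The checkpoint itself would be produced by the `convert_grounding_dino_to_hf.py` script introduced later in this commit, which renames the original state-dict keys and splits the fused qkv projections before loading them into the HF implementation.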
diff --git a/README_hd.md b/README_hd.md index 96c70ce393d66c..7e85a8c53d1713 100644 --- a/README_hd.md +++ b/README_hd.md @@ -322,6 +322,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv .org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https ://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। diff --git a/README_ja.md b/README_ja.md index 55fc6b3cedd230..8f347bdd79264e 100644 --- a/README_ja.md +++ b/README_ja.md @@ -384,6 +384,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました. 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) diff --git a/README_ko.md b/README_ko.md index 60a46aefe51b05..31418f42b8a9ff 100644 --- a/README_ko.md +++ b/README_ko.md @@ -299,6 +299,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다. 1. 
**[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 7b55646bb27dd2..107ed00f3de87f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -323,6 +323,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. 
**[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 15f56c66889e0c..a633740b292821 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -335,6 +335,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. 
**[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 7511ee66dd0b99..8ed9da455bf7ba 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [Grounding DINO](../model_doc/grounding-dino), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0b3ae3d74d3475..23cd86fd3f9d44 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -22,7 +22,7 @@ logger = logging.get_logger(__name__) GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "idea-research/grg-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", + "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } @@ -151,8 +151,8 @@ class GroundingDINOConfig(PretrainedConfig): def __init__( self, - use_timm_backbone=True, - backbone_config=None, + use_timm_backbone=False, + backbone_config={"model_type": "swin"}, num_channels=3, num_queries=300, max_position_embeddings=1024, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py new file mode 100644 index 00000000000000..b5de1d8a652c0e --- /dev/null +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -0,0 +1,242 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert GroundingDINO SimMIM checkpoints from the original repository. 
+ +URL: https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" + +import argparse + +import requests +import torch +from PIL import Image +from torchvision import transforms as T +import torchvision.transforms.functional as F + +from transformers import ( + GroundingDINOConfig, GroundingDINOForObjectDetection +) + +IMAGENET_MEAN = [0.485, 0.456, 0.406] +IMAGENET_STD = [0.229, 0.224, 0.225] + + +def get_grounding_dino_config(model_name): + config = GroundingDINOConfig() + + if "tiny" in model_name: + window_size = 7 + embed_dim = 96 + depths = (2, 2, 6, 2) + num_heads = (3, 6, 12, 24) + image_size = 224 + elif "base" in model_name: + window_size = 12 + embed_dim = 128 + depths = (2, 2, 18, 2) + num_heads = (4, 8, 16, 32) + image_size = 384 + else: + raise ValueError("Model not supported, only supports base and large variants") + + config.backbone_config.window_size = window_size + config.backbone_config.image_size = image_size + config.backbone_config.embed_dim = embed_dim + config.backbone_config.depths = depths + config.backbone_config.num_heads = num_heads + config.backbone_config.out_indices = [2, 3, 4] + + return config + + +def create_rename_keys(config): + rename_keys = [] + # fmt: off + #TODO names might change after modifing GroundingDINOModel class + ########################################## VISION BACKBONE - START + # patch embedding layer + rename_keys.append(("module.backbone.0.patch_embed.proj.weight", + "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("module.backbone.0.patch_embed.proj.bias", + "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) + rename_keys.append(("module.backbone.0.patch_embed.norm.weight", + "model.backbone.conv_encoder.model.embeddings.norm.weight")) + rename_keys.append(("module.backbone.0.patch_embed.norm.bias", + "model.backbone.conv_encoder.model.embeddings.norm.bias")) + + for layer, depth in enumerate(config.backbone_config.depths): + for block in range(depth): + # layernorms + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) + + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) + # attention + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) + # rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) + # intermidiate + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) + + # output + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) + + # downsample + if layer!=len(config.backbone_config.depths)-1: + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.reduction.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) + rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) + + for out_indice in config.backbone_config.out_indices: + # Grounding DINO implementation of out_indices isn't aligned with transformers + rename_keys.append((f"module.backbone.0.norm{out_indice-1}.weight", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) + rename_keys.append((f"module.backbone.0.norm{out_indice-1}.bias", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) + + ########################################## VISION BACKBONE - END + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + ########################################## VISION BACKBONE - START + embed_dim = config.backbone_config.embed_dim + for layer, depth in enumerate(config.backbone_config.depths): + hidden_size = embed_dim * 2**layer + for block in range(depth): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] + + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] + + 
state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight"] = in_proj_weight[-hidden_size :, :] + state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias"] = in_proj_bias[-hidden_size :] + ########################################## VISION BACKBONE - END + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + +@torch.no_grad() +def convert_grounding_dino_checkpoint(model_name, checkpoint_path): + #Define default GroundingDINO configuation + config = get_grounding_dino_config(model_name) + + # Load original checkpoint + original_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + + # Rename keys + new_state_dict = original_state_dict.copy() + rename_keys = create_rename_keys(config) + for src, dest in rename_keys: + rename_key(new_state_dict, src, dest) + read_in_q_k_v(new_state_dict, config) + + # Load HF implementation with default config and converted state dict + model = GroundingDINOForObjectDetection(config).eval() + model.load_state_dict(new_state_dict, strict=False) + + # Load and process test image + image = prepare_img() + image_processor = T.Compose( + [ + T.Resize(size=800, max_size=1333), + T.ToTensor(), + T.Normalize(IMAGENET_MEAN, IMAGENET_STD) + ] + ) + inputs = image_processor(image) + pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) + output= model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + for feature_map in output.feature_maps: + print(f"{feature_map.shape}") + print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") + + # outputs = model(**inputs).logits + + # print(outputs.keys()) + # print("Looks ok!") + + # if pytorch_dump_folder_path is not None: + # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + # model.save_pretrained(pytorch_dump_folder_path) + + # print(f"Saving image processor to {pytorch_dump_folder_path}") + # image_processor.save_pretrained(pytorch_dump_folder_path) + + # if push_to_hub: + # print(f"Pushing model and image processor for {model_name} to hub") + # model.push_to_hub(f"microsoft/{model_name}") + # image_processor.push_to_hub(f"microsoft/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="grounding-dino-tiny", + type=str, + choices=["grounding-dino-tiny", "grounding-dino-base"], + help="Name of the GroundingDINO model you'd like to convert.", + ) + parser.add_argument( + "--checkpoint_path", + default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny.pth", + type=str, + help="Path to the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_grounding_dino_checkpoint(args.model_name, args.checkpoint_path) \ No newline at end of file diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py deleted file mode 100644 index d3cef0366b2bca..00000000000000 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_pytorch.py +++ /dev/null @@ -1,237 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Grounding DINO checkpoints.""" - - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import cached_download, hf_hub_url -from PIL import Image - -from transformers import GroundingDINOConfig, GroundingDINOForObjectDetection, DeformableDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_key(orig_key): - if "backbone.0.body" in orig_key: - orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") - if "transformer" in orig_key: - orig_key = orig_key.replace("transformer.", "") - if "norm1" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm1", "self_attn_layer_norm") - else: - orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") - if "norm2" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm2", "final_layer_norm") - else: - orig_key = orig_key.replace("norm2", "self_attn_layer_norm") - if "norm3" in orig_key: - orig_key = orig_key.replace("norm3", "final_layer_norm") - if "linear1" in orig_key: - orig_key = orig_key.replace("linear1", "fc1") - if "linear2" in orig_key: - orig_key = orig_key.replace("linear2", "fc2") - if "query_embed" in orig_key: - orig_key = orig_key.replace("query_embed", "query_position_embeddings") - if "cross_attn" in orig_key: - orig_key = orig_key.replace("cross_attn", "encoder_attn") - - return orig_key - - -def read_in_q_k_v(state_dict): - # transformer decoder self-attention layers - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of 
cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_grounding_dino_checkpoint( - checkpoint_path, - single_scale, - dilation, - with_box_refine, - two_stage, - pytorch_dump_folder_path, - push_to_hub, -): - """ - Copy/paste/tweak model's weights to our Grounding DINO structure. - """ - - # load default config - config = GroundingDINOConfig() - # set config attributes - if single_scale: - config.num_feature_levels = 1 - config.dilation = dilation - config.with_box_refine = with_box_refine - config.two_stage = two_stage - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - image_processor = DeformableDetrImageProcessor(format="coco_detection") - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "model." - for key in state_dict.copy().keys(): - if not key.startswith("class_embed") and not key.startswith("bbox_embed"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = GroundingDINOForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - # verify our conversion - outputs = model(pixel_values.to(device)) - - expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ) - expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) - - if single_scale: - expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] - ) - expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) - - if single_scale and dilation: - expected_logits = torch.tensor( - [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] - ) - expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) - - if with_box_refine: - expected_logits = torch.tensor( - [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] - ) - expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) - - if with_box_refine and two_stage: - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ) - expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], 
[0.5490, 0.2763, 0.0564]]) - - print("Logits:", outputs.logits[0, :3, :3]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - - print("Everything ok!") - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - model_name = "deformable-detr" - model_name += "-single-scale" if single_scale else "" - model_name += "-dc5" if dilation else "" - model_name += "-with-box-refine" if with_box_refine else "" - model_name += "-two-stage" if two_stage else "" - print("Pushing model to hub...") - model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - type=str, - default="/home/niels/checkpoints/grounding_dino/r50_grounding_dino-checkpoint.pth", - help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", - ) - parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") - parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") - parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") - parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
- ) - args = parser.parse_args() - convert_grounding_dino_checkpoint( - args.checkpoint_path, - args.single_scale, - args.dilation, - args.with_box_refine, - args.two_stage, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ee80a562e4b851..603bdfdd8e8126 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -122,10 +122,10 @@ def backward(context, grad_output): logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "GroundingDINOConfig" -_CHECKPOINT_FOR_DOC = "idea-research/grg-dino-tiny" +_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "idea-research/grg-dino-tiny", + "idea-research/grounding-dino-tiny", # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino ] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 2991bca449b3c7..22f24222f67514 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2486,6 +2486,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GroundingDINOForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = None From 92c31bfa6ae676313b48e88adbf53628167dbb8f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 27 Aug 2023 01:47:21 -0300 Subject: [PATCH 042/252] Added bert to model --- .../configuration_grounding_dino.py | 7 +- .../convert_grounding_dino_to_hf.py | 13 +- .../grounding_dino/modeling_grounding_dino.py | 686 +++++++++++++++++- 3 files changed, 692 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 23cd86fd3f9d44..9025d01e725561 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -16,7 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -44,6 +44,8 @@ class GroundingDINOConfig(PretrainedConfig): backbone_config (`PretrainedConfig` or `dict`, *optional*): The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which case it will default to `ResNetConfig()`. + text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): + The configuration of the text backbone model. Should be a bert-like config. num_channels (`int`, *optional*, defaults to 3): The number of input channels. 
num_queries (`int`, *optional*, defaults to 300): @@ -153,6 +155,7 @@ def __init__( self, use_timm_backbone=False, backbone_config={"model_type": "swin"}, + text_backbone_config="bert-base-uncased", num_channels=3, num_queries=300, max_position_embeddings=1024, @@ -251,6 +254,8 @@ def __init__( self.eos_coefficient = eos_coefficient self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels + # Text backbone + self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index b5de1d8a652c0e..d5b07b32c3f49f 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -60,7 +60,7 @@ def get_grounding_dino_config(model_name): return config -def create_rename_keys(config): +def create_rename_keys(state_dict, config): rename_keys = [] # fmt: off #TODO names might change after modifing GroundingDINOModel class @@ -126,10 +126,14 @@ def create_rename_keys(config): ########################################## VISION BACKBONE - END + ########################################## TEXT BACKBONE - START + for layer_name, params in state_dict.items(): + if "module.bert" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) + ########################################## TEXT BACKBONE - END # fmt: on return rename_keys - def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val @@ -172,7 +176,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Rename keys new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(config) + rename_keys = create_rename_keys(original_state_dict, config) for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) @@ -192,7 +196,8 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) inputs = image_processor(image) pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) - output= model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + output = model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + for feature_map in output.feature_maps: print(f"{feature_map.shape}") print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 603bdfdd8e8126..8bea6eee50096e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -19,7 +19,7 @@ import math import warnings from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -39,8 +39,13 @@ replace_return_docstrings, requires_backends, ) -from ...modeling_outputs import BaseModelOutput +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPoolingAndCrossAttentions, + BaseModelOutputWithPastAndCrossAttentions +) from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, 
find_pruneable_heads_and_indices, prune_linear_layer from ...pytorch_utils import meshgrid from ...utils import is_ninja_available, logging from ..auto import AutoBackbone @@ -173,7 +178,7 @@ class GroundingDINODecoderOutput(ModelOutput): # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOModelOutput(ModelOutput): """ - Base class for outputs of the Grounding DINO encoder-decoder model. + Base class for outputs of the Deformable DETR encoder-decoder model. Args: init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): @@ -432,6 +437,7 @@ def __init__(self, config): if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: parameter.requires_grad_(False) + # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDINO def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps @@ -600,7 +606,7 @@ def multi_scale_deformable_attention( # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOMultiscaleDeformableAttention(nn.Module): """ - Multiscale deformable attention as proposed in Grounding DINO. + Multiscale deformable attention as proposed in Deformable DETR. """ def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): @@ -736,7 +742,7 @@ class GroundingDINOMultiheadAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. - Here, we add position embeddings to the queries and keys (as explained in the Grounding DINO paper). + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). """ def __init__( @@ -1294,7 +1300,7 @@ class GroundingDINODecoder(GroundingDINOPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. - Some tweaks for Grounding DINO: + Some tweaks for Deformable DETR: - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. - it also returns a stack of intermediate outputs and reference points from all decoding layers. 
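As an illustration of the `spatial_shapes` and `valid_ratios` inputs mentioned in the decoder docstring above, here is a minimal sketch of how Deformable-DETR-style models typically derive them from the per-level pixel masks. The helper name, shapes and values are assumptions made for the example only; they are not part of this patch.

import torch

def get_valid_ratio(mask: torch.Tensor) -> torch.Tensor:
    # mask: (batch_size, height, width), 1 for real pixels, 0 for padding
    _, height, width = mask.shape
    valid_height = mask[:, :, 0].sum(dim=1).float()
    valid_width = mask[:, 0, :].sum(dim=1).float()
    # fraction of each feature map that corresponds to unpadded image content
    return torch.stack([valid_width / width, valid_height / height], dim=-1)  # (batch_size, 2)

# one mask per feature level, e.g. strides 8/16/32/64 of a padded batch of 2 images
masks = [torch.ones(2, h, w, dtype=torch.long) for h, w in [(100, 152), (50, 76), (25, 38), (13, 19)]]
spatial_shapes = torch.tensor([m.shape[1:] for m in masks])             # (num_levels, 2)
valid_ratios = torch.stack([get_valid_ratio(m) for m in masks], dim=1)  # (batch_size, num_levels, 2)
print(spatial_shapes.shape, valid_ratios.shape)  # torch.Size([4, 2]) torch.Size([2, 4, 2])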
@@ -1310,7 +1316,7 @@ def __init__(self, config: GroundingDINOConfig): self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.gradient_checkpointing = False - # hack implementation for iterative bounding box refinement and two-stage Grounding DINO + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR self.bbox_embed = None self.class_embed = None @@ -1493,6 +1499,8 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + # Create Text Extractor + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -1772,7 +1780,7 @@ def forward( encoder_outputs[0], ~mask_flatten, spatial_shapes ) - # hack implementation for two-stage Grounding DINO + # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) @@ -1850,7 +1858,7 @@ class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): def __init__(self, config: GroundingDINOConfig): super().__init__(config) - # Grounding DINO encoder-decoder model + # Deformable DETR encoder-decoder model self.model = GroundingDINOModel(config) # Detection heads on top @@ -2178,6 +2186,7 @@ def loss_labels(self, outputs, targets, indices, num_boxes): return losses @torch.no_grad() + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality def loss_cardinality(self, outputs, targets, indices, num_boxes): """ Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. @@ -2193,6 +2202,7 @@ def loss_cardinality(self, outputs, targets, indices, num_boxes): losses = {"cardinality_error": card_err} return losses + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes def loss_boxes(self, outputs, targets, indices, num_boxes): """ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. 
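To make the `loss_boxes` docstring above concrete, a small self-contained sketch of the L1 + generalized-IoU combination used by DETR-style detectors follows. The toy box values, the `center_to_corners` helper and the use of `torchvision.ops.generalized_box_iou` are assumptions to keep the example runnable; the patch relies on its own box utilities.

import torch
from torchvision.ops import generalized_box_iou

def center_to_corners(boxes: torch.Tensor) -> torch.Tensor:
    # (center_x, center_y, width, height) -> (x_min, y_min, x_max, y_max)
    cx, cy, w, h = boxes.unbind(-1)
    return torch.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], dim=-1)

# three matched prediction/target pairs in normalized cxcywh format
src_boxes = torch.tensor([[0.5, 0.5, 0.2, 0.2], [0.3, 0.3, 0.1, 0.4], [0.7, 0.6, 0.3, 0.2]])
target_boxes = torch.tensor([[0.5, 0.5, 0.25, 0.2], [0.3, 0.35, 0.1, 0.4], [0.7, 0.6, 0.3, 0.2]])
num_boxes = src_boxes.shape[0]

# L1 regression term, summed over coordinates and normalized by the number of boxes
loss_bbox = torch.nn.functional.l1_loss(src_boxes, target_boxes, reduction="none").sum() / num_boxes
# GIoU term: only the diagonal (matched pairs) of the pairwise GIoU matrix is used
giou = torch.diag(generalized_box_iou(center_to_corners(src_boxes), center_to_corners(target_boxes)))
loss_giou = (1 - giou).sum() / num_boxes
print(float(loss_bbox), float(loss_giou))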
@@ -2217,12 +2227,14 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): losses["loss_giou"] = loss_giou.sum() / num_boxes return losses + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx def _get_source_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) source_idx = torch.cat([source for (source, _) in indices]) return batch_idx, source_idx + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx def _get_target_permutation_idx(self, indices): # permute targets following indices batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) @@ -2511,3 +2523,659 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): else: raise ValueError("Only 3-dimensional tensors are supported") return NestedTensor(tensor, mask) + +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText +class GroundingDINOTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = 
self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText +class GroundingDINOTextSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDINOTextModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText +class GroundingDINOTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText +class GroundingDINOTextAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = GroundingDINOTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = GroundingDINOTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText +class 
GroundingDINOTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText +class GroundingDINOTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText +class GroundingDINOTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = GroundingDINOTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = GroundingDINOTextAttention(config, position_embedding_type="absolute") + self.intermediate = GroundingDINOTextIntermediate(config) + self.output = GroundingDINOTextOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + 
cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText +class GroundingDINOTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([GroundingDINOTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText +class GroundingDINOTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + +# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText +class GroundingDINOTextModel(nn.Module): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__() + self.config = config + + self.embeddings = GroundingDINOTextEmbeddings(config) + self.encoder = GroundingDINOTextEncoder(config) + + self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) From 8f0a755c18f2d4065a5008bbd4202cbf44aa8a74 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 29 Aug 2023 23:30:53 -0300 Subject: [PATCH 043/252] Bert validated --- .../configuration_grounding_dino.py | 7 +- .../convert_grounding_dino_to_hf.py | 105 ++++++++++++++++-- .../grounding_dino/modeling_grounding_dino.py | 5 +- 3 files changed, 106 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 9025d01e725561..0b4df30f6ee46f 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -177,7 +177,7 @@ def __init__( return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", - backbone="resnet50", + backbone="swin", use_pretrained_backbone=True, dilation=False, num_feature_levels=4, @@ -196,6 +196,9 @@ def __init__( eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, + #other parameters + max_text_len = 256, + sub_sentence_present = True, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -256,6 +259,8 @@ def __init__( self.disable_custom_kernels = disable_custom_kernels # Text 
backbone self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) + self.max_text_len = max_text_len + self.sub_sentence_present = sub_sentence_present super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d5b07b32c3f49f..d5ebc9281b8733 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -25,7 +25,7 @@ import torchvision.transforms.functional as F from transformers import ( - GroundingDINOConfig, GroundingDINOForObjectDetection + GroundingDINOConfig, GroundingDINOForObjectDetection, AutoTokenizer ) IMAGENET_MEAN = [0.485, 0.456, 0.406] @@ -166,6 +166,88 @@ def prepare_img(): image = Image.open(requests.get(url, stream=True).raw).convert("RGB") return image +def text_processor(text: str, config): + def preprocess_caption(caption: str) -> str: + result = caption.lower().strip() + if result.endswith("."): + return result + return result + "." + def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + ) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + cate_to_token_mask_list = [ + torch.stack(cate_to_token_mask_listi, dim=0) + for cate_to_token_mask_listi in cate_to_token_mask_list + ] + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.to(torch.long) + tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) + special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + text = preprocess_caption(text) + tokenized = tokenizer([text], padding="longest", return_tensors="pt") + text_self_attention_masks, position_ids = 
generate_masks_with_special_tokens_and_transfer_map( + tokenized, special_tokens) + + max_text_len = config.max_text_len + sub_sentence_present = config.sub_sentence_present + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[ + :, : max_text_len, : max_text_len + ] + position_ids = position_ids[:, : max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, : max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, : max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : max_text_len] + + # extract text embeddings + if sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + + return tokenized_for_encoder + @torch.no_grad() def convert_grounding_dino_checkpoint(model_name, checkpoint_path): #Define default GroundingDINO configuation @@ -187,6 +269,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Load and process test image image = prepare_img() + text = "a cat" image_processor = T.Compose( [ T.Resize(size=800, max_size=1333), @@ -194,13 +277,21 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): T.Normalize(IMAGENET_MEAN, IMAGENET_STD) ] ) - inputs = image_processor(image) - pixel_mask = torch.ones(((1, inputs.shape[1], inputs.shape[2])), dtype=torch.long, device=inputs.device) - output = model.model.backbone.conv_encoder.model(pixel_values=inputs.unsqueeze(0)) + image_inputs = image_processor(image) + text_inputs = text_processor(text, config) + + pixel_mask = torch.ones( + ((1, image_inputs.shape[1], image_inputs.shape[2])), + dtype=torch.long, + device=image_inputs.device + ) + # output = model.model.backbone.conv_encoder.model(pixel_values=image_inputs.unsqueeze(0)) + output = model.model.text_backbone(**text_inputs) + print(output.last_hidden_state[:, :, :5]) - for feature_map in output.feature_maps: - print(f"{feature_map.shape}") - print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") + # for feature_map in output.last_hidden_state: + # print(f"{feature_map.shape}") + # print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") # outputs = model(**inputs).logits diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8bea6eee50096e..ebe151de480211 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -3014,7 +3014,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output # Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText -class GroundingDINOTextModel(nn.Module): +class GroundingDINOTextModel(PreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -3028,8 +3028,7 @@ class GroundingDINOTextModel(nn.Module): """ def __init__(self, config, add_pooling_layer=True): - super().__init__() - self.config = config + super().__init__(config) self.embeddings = GroundingDINOTextEmbeddings(config) self.encoder = GroundingDINOTextEncoder(config) From fb1c55c3d9ad42769ba7c16e6ab2643fa264a21c Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 31 Aug 2023 20:03:28 -0300 Subject: [PATCH 044/252] Created Text and Fusion layers for 
Encoder --- .../configuration_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 309 +++++++++++++++++- 2 files changed, 306 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0b4df30f6ee46f..e77d4be247b746 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -160,7 +160,7 @@ def __init__( num_queries=300, max_position_embeddings=1024, encoder_layers=6, - encoder_ffn_dim=1024, + encoder_ffn_dim=2048, encoder_attention_heads=8, decoder_layers=6, decoder_ffn_dim=1024, diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ebe151de480211..731172570c23d2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -854,6 +854,304 @@ def forward( return attn_output, attn_weights_reshaped +# Repeting some code to avoid convert nn.MultiheadAttention later +class GroundingDINOEncoderTextLayer(nn.Module): + def __init__( + self, + embed_dim, + num_heads, + ffn_dim: int, + dropout: float = 0.0, + bias: bool = True, + activation: str = 'relu' + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) + # Implementation of Feedforward model + self.fc1 = nn.Linear(embed_dim, ffn_dim) + self.dropout = nn.Dropout(dropout) + self.fc2 = nn.Linear(ffn_dim, embed_dim) + + self.layer_norm_before = nn.LayerNorm(embed_dim) + self.layer_norm_after = nn.LayerNorm(embed_dim) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = ACT2FN[activation] + self.num_heads = num_heads + + def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): + return hidden_state if position_embeddings is None else hidden_state + position_embeddings + + def forward( + self, + hidden_states: Tensor, + attention_masks: Optional[Tensor] = None, + position_embeddings: Optional[Tensor] = None, + ): # repeat attn mask + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[1]: + # bs, num_q, num_k + attention_masks = attention_masks.repeat(self.num_heads, 1, 1) + + q = k = self.with_pos_embed(hidden_states, position_embeddings) + attention_output = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks)[0] + + hidden_states = hidden_states + self.dropout1(attention_output) + hidden_states = self.layer_norm_before(hidden_states) + hidden_states = self.activation(self.fc1(hidden_states)) + attention_output = self.fc2(self.dropout(hidden_states)) + hidden_states = hidden_states + self.dropout2(attention_output) + hidden_states = self.layer_norm_after(hidden_states) + return hidden_states + +class BiMultiHeadAttention(nn.Module): + def __init__( + self, + vision_dim: int, + text_dim: int, + embed_dim: int, + num_heads: int, + dropout:float = 0.1 + ): + super().__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.vision_dim = vision_dim + self.text_dim = text_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
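# Illustrative aside (an assumed toy example, not part of this module): the attention implemented
# below computes one image-text similarity matrix and reuses it in both directions: softmax over
# the text axis updates image features, softmax over the image axis updates text features.
# A single-head sketch with made-up shapes:
import torch
import torch.nn.functional as F

batch_size, num_image_tokens, num_text_tokens, head_dim = 1, 5, 3, 4
image_states = torch.randn(batch_size, num_image_tokens, head_dim)
text_states = torch.randn(batch_size, num_text_tokens, head_dim)

# shared attention logits between every image token and every text token
scores = torch.bmm(image_states, text_states.transpose(1, 2)) / head_dim**0.5  # (batch, image, text)
# image tokens gather text context (softmax over text tokens)
image_to_text = torch.bmm(F.softmax(scores, dim=-1), text_states)
# text tokens gather image context (softmax over image tokens, via the transposed logits)
text_to_image = torch.bmm(F.softmax(scores.transpose(1, 2), dim=-1), image_states)
print(image_to_text.shape, text_to_image.shape)  # torch.Size([1, 5, 4]) torch.Size([1, 3, 4])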
+ self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.text_proj = nn.Linear(self.text_dim, self.embed_dim) + self.values_vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.values_text_proj = nn.Linear(self.text_dim, self.embed_dim) + + self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) + self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.vision_proj.weight) + self.vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.text_proj.weight) + self.text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_vision_proj.weight) + self.values_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_text_proj.weight) + self.values_text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_vision_proj.weight) + self.out_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_text_proj.weight) + self.out_text_proj.bias.data.fill_(0) + + def forward( + self, + vision_features: Tensor, + text_features: Tensor, + vision_attention_mask: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None + ): + """_summary_ + + Args: + vision_features Tensor: bs, n_img, dim + text_features Tensor: bs, n_text, dim + vision_attention_mask (Tensor, optional): _description_. bs, n_img + text_attention_mask (Tensor, optional): _description_. bs, n_text + + Returns: + _type_: _description_ + """ + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + bsz, tgt_len, _ = vision_features.size() + + vision_query_states = self.vision_proj(vision_features) * self.scale + vision_query_states = self._shape(vision_query_states, tgt_len, bsz) + + text_key_states = self.text_proj(text_features) + text_key_states = self._shape(text_key_states, -1, bsz) + + vision_value_states = self.values_vision_proj(vision_features) + vision_value_states = self._shape(vision_value_states, -1, bsz) + + text_value_states = self.values_text_proj(text_features) + text_value_states = self._shape(text_value_states, -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + + vision_query_states = vision_query_states.view(*proj_shape) + text_key_states = text_key_states.view(*proj_shape) + vision_value_states = vision_value_states.view(*proj_shape) + text_value_states = text_value_states.view(*proj_shape) + + src_len = text_key_states.size(1) + attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + attn_weights = attn_weights - attn_weights.max() + + attn_weights = torch.clamp( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + attn_weights = torch.clamp( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose(1, 2) + text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, 
keepdim=True)[0] + + text_attn_weights = torch.clamp( + text_attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + text_attn_weights = torch.clamp( + text_attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if vision_attention_mask is not None: + vision_attention_mask = ( + vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + text_attn_weights.masked_fill_(vision_attention_mask, float("-inf")) + + text_attn_weights = text_attn_weights.softmax(dim=-1) + + # mask language for vision + if text_attention_mask is not None: + text_attention_mask = ( + text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights.masked_fill_(text_attention_mask, float("-inf")) + vision_attn_weights = attn_weights.softmax(dim=-1) + + vision_attn_probs = F.dropout(vision_attn_weights, p=self.dropout, training=self.training) + text_attn_probs = F.dropout(text_attn_weights, p=self.dropout, training=self.training) + + vision_attn_output = torch.bmm(vision_attn_probs, text_value_states) + text_attn_output = torch.bmm(text_attn_probs, vision_value_states) + + if vision_attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`vision_attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" + ) + + if text_attn_output.size() != (bsz * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`text_attn_output` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" + ) + + vision_attn_output = vision_attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + vision_attn_output = vision_attn_output.transpose(1, 2) + vision_attn_output = vision_attn_output.reshape(bsz, tgt_len, self.embed_dim) + + text_attn_output = text_attn_output.view(bsz, self.num_heads, src_len, self.head_dim) + text_attn_output = text_attn_output.transpose(1, 2) + text_attn_output = text_attn_output.reshape(bsz, src_len, self.embed_dim) + + vision_attn_output = self.out_vision_proj(vision_attn_output) + text_attn_output = self.out_text_proj(text_attn_output) + + return vision_attn_output, text_attn_output + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO +class GroundingDINODropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + +class GroundingDINOBiAttention(nn.Module): + def __init__( + self, + vision_dim, + text_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super().__init__() + + # pre layer norm + self.layer_norm_vision = nn.LayerNorm(vision_dim) + self.layer_norm_text = nn.LayerNorm(text_dim) + self.attn = BiMultiHeadAttention( + vision_dim=vision_dim, text_dim=text_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.gamma_v = nn.Parameter(init_values * torch.ones((vision_dim)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((text_dim)), requires_grad=True) + + def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): + vision_features = self.layer_norm_vision(vision_features) + text_features = self.layer_norm_text(text_features) + delta_v, delta_l = self.attn( + vision_features, + text_features, + attention_mask_vision=attention_mask_vision, + attention_mask_text=attention_mask_text + ) + # vision_features, text_features = vision_features + delta_v, text_features + delta_l + vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) + text_features = text_features + self.drop_path(self.gamma_l * delta_l) + return vision_features, text_features # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO class GroundingDINOEncoderLayer(nn.Module): @@ -1499,8 +1797,6 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) - # Create Text Extractor - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -1850,7 +2146,6 @@ def forward( """, GROUNDING_DINO_START_DOCSTRING, ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with 
DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] @@ -1866,6 +2161,7 @@ def __init__(self, config: GroundingDINOConfig): self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) @@ -2588,6 +2884,8 @@ def forward( embeddings = self.dropout(embeddings) return embeddings +# Classes for Text Backbone (It's just a BERT model) + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3013,7 +3311,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: pooled_output = self.activation(pooled_output) return pooled_output -# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->GroundingDINOText class GroundingDINOTextModel(PreTrainedModel): """ @@ -3029,12 +3326,16 @@ class GroundingDINOTextModel(PreTrainedModel): def __init__(self, config, add_pooling_layer=True): super().__init__(config) + self.config = config self.embeddings = GroundingDINOTextEmbeddings(config) self.encoder = GroundingDINOTextEncoder(config) self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + # Initialize weights and apply final processing + self.post_init() + def get_input_embeddings(self): return self.embeddings.word_embeddings From 86131aff2aee36051ce1a9fef81fa552152aea12 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 31 Aug 2023 20:59:26 -0300 Subject: [PATCH 045/252] Adapted Encoder layer --- .../configuration_grounding_dino.py | 8 + .../grounding_dino/modeling_grounding_dino.py | 180 +++++++++++++----- 2 files changed, 137 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e77d4be247b746..3abf4912ebb651 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -199,6 +199,9 @@ def __init__( #other parameters max_text_len = 256, sub_sentence_present = True, + text_enhancer_dropout = 0.0, + fusion_droppath = 0.1, + fusion_dropout = 0.0, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -261,6 +264,11 @@ def __init__( self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) self.max_text_len = max_text_len self.sub_sentence_present = sub_sentence_present + # Text Enhancer + self.text_enhancer_dropout = text_enhancer_dropout + # Fusion + self.fusion_droppath = fusion_droppath + self.fusion_dropout = fusion_dropout super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 731172570c23d2..91129946c6141e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -855,30 +855,28 @@ def 
forward( return attn_output, attn_weights_reshaped # Repeting some code to avoid convert nn.MultiheadAttention later -class GroundingDINOEncoderTextLayer(nn.Module): - def __init__( - self, - embed_dim, - num_heads, - ffn_dim: int, - dropout: float = 0.0, - bias: bool = True, - activation: str = 'relu' - ): +#TODO is this an approriate way to name this? +class GroundingDINOTextEnhancerLayer(nn.Module): + """Vanilla Transformer with text embeddings as input""" + def __init__(self, config): super().__init__() - self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) + self.self_attn = nn.MultiheadAttention( + embed_dim=config.d_model, + num_heads=config.num_heads // 2, + dropout=config.text_enhancer_dropout + ) # Implementation of Feedforward model - self.fc1 = nn.Linear(embed_dim, ffn_dim) - self.dropout = nn.Dropout(dropout) - self.fc2 = nn.Linear(ffn_dim, embed_dim) + self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) + self.dropout = nn.Dropout(config.text_enhancer_dropout) + self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) - self.layer_norm_before = nn.LayerNorm(embed_dim) - self.layer_norm_after = nn.LayerNorm(embed_dim) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) + self.layer_norm_before = nn.LayerNorm(config.d_model) + self.layer_norm_after = nn.LayerNorm(config.d_model) + self.dropout1 = nn.Dropout(config.text_enhancer_dropout) + self.dropout2 = nn.Dropout(config.text_enhancer_dropout) - self.activation = ACT2FN[activation] - self.num_heads = num_heads + self.activation = ACT2FN[config.activation_fuction] + self.num_heads = config.num_heads // 2 def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -903,8 +901,8 @@ def forward( hidden_states = hidden_states + self.dropout2(attention_output) hidden_states = self.layer_norm_after(hidden_states) return hidden_states - -class BiMultiHeadAttention(nn.Module): + +class GroundingDINOBiMultiHeadAttention(nn.Module): def __init__( self, vision_dim: int, @@ -1106,38 +1104,26 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -class GroundingDINOBiAttention(nn.Module): - def __init__( - self, - vision_dim, - text_dim, - embed_dim, - num_heads, - dropout=0.1, - drop_path=0.0, - init_values=1e-4, - ): - """ - Inputs: - embed_dim - Dimensionality of input and attention feature vectors - hidden_dim - Dimensionality of hidden layer in feed-forward network - (usually 2-4x larger than embed_dim) - num_heads - Number of heads to use in the Multi-Head Attention block - dropout - Amount of dropout to apply in the feed-forward network - """ +class GroundingDINOFusionLayer(nn.Module): + def __init__(self, config, init_values=1e-4): super().__init__() + drop_path = config.fusion_droppath # pre layer norm - self.layer_norm_vision = nn.LayerNorm(vision_dim) - self.layer_norm_text = nn.LayerNorm(text_dim) - self.attn = BiMultiHeadAttention( - vision_dim=vision_dim, text_dim=text_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + self.layer_norm_vision = nn.LayerNorm(config.d_model) + self.layer_norm_text = nn.LayerNorm(config.d_model) + self.attn = GroundingDINOBiMultiHeadAttention( + vision_dim=config.d_model, + text_dim=config.d_model, + embed_dim=config.encoder_ffn_dim // 2, + num_heads=config.num_heads // 2, + dropout=config.fusion_dropout ) # add 
layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.gamma_v = nn.Parameter(init_values * torch.ones((vision_dim)), requires_grad=True) - self.gamma_l = nn.Parameter(init_values * torch.ones((text_dim)), requires_grad=True) + self.gamma_v = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) @@ -1153,8 +1139,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at text_features = text_features + self.drop_path(self.gamma_l * delta_l) return vision_features, text_features -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->GroundingDINO -class GroundingDINOEncoderLayer(nn.Module): +#NOTE just renamed the class +class GroundingDINODeformableLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() self.embed_dim = config.d_model @@ -1238,6 +1224,98 @@ def forward( return outputs +def get_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, + ) -> Tensor: + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. 
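+
+    Example (illustrative sketch; the batch and sequence sizes below are arbitrary assumptions):
+
+        >>> # one scalar position per token, e.g. text position ids as used by the encoder layer
+        >>> position_ids = torch.arange(6, dtype=torch.float32)[None, :, None]  # shape (1, 6, 1), so n = 1
+        >>> pos_embed = get_sine_pos_embed(position_ids, num_pos_feats=256, exchange_xy=False)
+        >>> pos_embed.shape  # (..., n * num_pos_feats)
+        torch.Size([1, 6, 256])
+        >>> # exchange_xy should only be left True when the last dimension holds at least an (x, y) pair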
+ """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res + + +class GroundingDINOEncoderLayer(nn.Module): + def __init__(self, config) -> None: + super().__init_() + self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) + self.fusion_layer = GroundingDINOFusionLayer(config) + self.deformable_layer = GroundingDINODeformableLayer(config) + + def forward( + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None + ): + bs, n_text, text_dim = text_features.shape + if text_position_embedding is None and text_position_ids is None: + pos_text = ( + torch.arange(n_text, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + pos_text = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + vision_features, text_features = self.fusion_layer( + vision_features=vision_features, + text_features=text_features, + attention_mask_vision=key_padding_mask, + attention_mask_text=text_attention_mask, + ) + + text_features = self.text_enhancer_layer( + hidden_states=text_features.transpose(0, 1), + attention_masks=~text_self_attention_masks, # note we use ~ for mask here + position_embeddings=(pos_text.transpose(0, 1) if pos_text is not None else None), + ).transpose(0, 1) + + vision_features = self.deformable_layer( + hidden_states=vision_features, + attention_mask=key_padding_mask, + position_embeddings=vision_position_embedding, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + ) + + return vision_features, text_features + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO class GroundingDINODecoderLayer(nn.Module): @@ -1788,7 +1866,6 @@ def custom_forward(*inputs): """, GROUNDING_DINO_START_DOCSTRING, ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->GroundingDINO,DEFORMABLE_DETR->GROUNDING_DINO,Deformable DETR->Grounding DINO class GroundingDINOModel(GroundingDINOPreTrainedModel): def __init__(self, config: GroundingDINOConfig): super().__init__(config) @@ -1797,6 +1874,8 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + # Create text backbone + self.text_backbone = 
GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -2161,7 +2240,6 @@ def __init__(self, config: GroundingDINOConfig): self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) From 8ad3226e297beadd5efdccbbdcadca98989d625e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 1 Sep 2023 11:37:07 -0300 Subject: [PATCH 046/252] Fixed typos --- .../grounding_dino/modeling_grounding_dino.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 91129946c6141e..984587d3997d67 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -862,7 +862,7 @@ def __init__(self, config): super().__init__() self.self_attn = nn.MultiheadAttention( embed_dim=config.d_model, - num_heads=config.num_heads // 2, + num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout ) # Implementation of Feedforward model @@ -875,8 +875,8 @@ def __init__(self, config): self.dropout1 = nn.Dropout(config.text_enhancer_dropout) self.dropout2 = nn.Dropout(config.text_enhancer_dropout) - self.activation = ACT2FN[config.activation_fuction] - self.num_heads = config.num_heads // 2 + self.activation = ACT2FN[config.activation_function] + self.num_heads = config.encoder_attention_heads // 2 def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -1116,7 +1116,7 @@ def __init__(self, config, init_values=1e-4): vision_dim=config.d_model, text_dim=config.d_model, embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.num_heads // 2, + num_heads=config.encoder_attention_heads // 2, dropout=config.fusion_dropout ) @@ -1258,25 +1258,25 @@ def sine_func(x: torch.Tensor): class GroundingDINOEncoderLayer(nn.Module): def __init__(self, config) -> None: - super().__init_() + super().__init__() self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) def forward( - self, - vision_features: Tensor, - vision_position_embedding: Tensor, - spatial_shapes: Tensor, - level_start_index: Tensor, - key_padding_mask: Tensor, - reference_points: Tensor, - text_features: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None, - text_position_embedding: Optional[Tensor] = None, - text_self_attention_masks: Optional[Tensor] = None, - text_position_ids: Optional[Tensor] = None - ): + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None + ): bs, n_text, text_dim = text_features.shape if text_position_embedding is None and text_position_ids is None: pos_text = 
( From 21e3fa2f70ee396268c2af1e3774db976aa91075 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 4 Sep 2023 13:08:37 -0300 Subject: [PATCH 047/252] Adjusted Encoder --- .../grounding_dino/modeling_grounding_dino.py | 234 +++++++++++++----- 1 file changed, 176 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 984587d3997d67..229c5d89c716f9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -173,6 +173,55 @@ class GroundingDINODecoderOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None cross_attentions: Optional[Tuple[torch.FloatTensor]] = None +@dataclass +class GroundingDINOEncoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDINOEncoder. This class extends + BaseModelOutput, due to: + - vision and text last hidden states + - vision and text intermediate hidden states + - vision and text attentions + - vision and text cross attentions + + Args: + last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the vision encoder. + last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the text encoder. + hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer + plus the initial embedding outputs. + hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer + plus the initial embedding outputs. + attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + """ + last_hidden_state_vision: torch.FloatTensor = None + last_hidden_state_text: torch.FloatTensor = None + hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + attentions_text: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + @dataclass # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO @@ -892,7 +941,7 @@ def forward( attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) - attention_output = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks)[0] + attention_output, attention_weights = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks) hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) @@ -900,7 +949,7 @@ def forward( attention_output = self.fc2(self.dropout(hidden_states)) hidden_states = hidden_states + self.dropout2(attention_output) hidden_states = self.layer_norm_after(hidden_states) - return hidden_states + return hidden_states, attention_weights class GroundingDINOBiMultiHeadAttention(nn.Module): def __init__( @@ -933,10 +982,6 @@ def __init__( self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) - self.stable_softmax_2d = True - self.clamp_min_for_underflow = True - self.clamp_max_for_overflow = True - self._reset_parameters() def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): @@ -1068,7 +1113,7 @@ def forward( vision_attn_output = self.out_vision_proj(vision_attn_output) text_attn_output = self.out_text_proj(text_attn_output) - return vision_attn_output, text_attn_output + return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) # Copied from transformers.models.beit.modeling_beit.drop_path def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: @@ -1128,16 +1173,16 @@ def __init__(self, config, init_values=1e-4): def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) - delta_v, delta_l = self.attn( + (delta_v, vision_attn), (delta_t, text_attn) = self.attn( vision_features, text_features, attention_mask_vision=attention_mask_vision, attention_mask_text=attention_mask_text ) - # vision_features, text_features = vision_features + delta_v, text_features + delta_l vision_features = vision_features + 
self.drop_path(self.gamma_v * delta_v) - text_features = text_features + self.drop_path(self.gamma_l * delta_l) - return vision_features, text_features + text_features = text_features + self.drop_path(self.gamma_l * delta_t) + + return (vision_features, vision_attn), (text_features, text_attn) #NOTE just renamed the class class GroundingDINODeformableLayer(nn.Module): @@ -1263,6 +1308,29 @@ def __init__(self, config) -> None: self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) + def get_text_position_embeddings( + self, + text_features: Tensor, + text_position_embedding: Tensor, + text_position_ids: Tensor + ) -> Tensor: + bs, n_text, text_dim = text_features.shape + if text_position_embedding is None and text_position_ids is None: + text_position_embedding = ( + torch.arange(n_text, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + return text_position_embedding + def forward( self, vision_features: Tensor, @@ -1277,35 +1345,28 @@ def forward( text_self_attention_masks: Optional[Tensor] = None, text_position_ids: Optional[Tensor] = None ): - bs, n_text, text_dim = text_features.shape - if text_position_embedding is None and text_position_ids is None: - pos_text = ( - torch.arange(n_text, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(bs, 1, 1) - ) - pos_text = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) - if text_position_ids is not None: - text_position_embedding = get_sine_pos_embed( - text_position_ids[..., None], num_pos_feats=256, exchange_xy=False - ) + text_position_embedding = self.get_text_position_embeddings( + text_features, + text_position_embedding, + text_position_ids + ) - vision_features, text_features = self.fusion_layer( + (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( vision_features=vision_features, text_features=text_features, attention_mask_vision=key_padding_mask, attention_mask_text=text_attention_mask, ) - text_features = self.text_enhancer_layer( + (text_features, text_enhanced_attn) = self.text_enhancer_layer( hidden_states=text_features.transpose(0, 1), attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=(pos_text.transpose(0, 1) if pos_text is not None else None), + position_embeddings=( + text_position_embedding.transpose(0, 1) if text_position_embedding is not None else None + ), ).transpose(0, 1) - vision_features = self.deformable_layer( + (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, attention_mask=key_padding_mask, position_embeddings=vision_position_embedding, @@ -1314,7 +1375,10 @@ def forward( level_start_index=level_start_index, ) - return vision_features, text_features + return ( + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn) + ) # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO @@ -1538,7 +1602,6 @@ def _set_gradient_checkpointing(self, module, value=False): """ -# Copied from 
transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->GroundingDINO class GroundingDINOEncoder(GroundingDINOPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a @@ -1592,26 +1655,31 @@ def get_reference_points(spatial_shapes, valid_ratios, device): def forward( self, - inputs_embeds=None, - attention_mask=None, - position_embeddings=None, - spatial_shapes=None, - level_start_index=None, + vision_features: Tensor, + vision_attention_mask: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios=None, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: - 1 for pixel features that are real (i.e. **not masked**), - 0 for pixel features that are padding (i.e. **masked**). [What are attention masks?](../glossary#attention-mask) - position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Position embeddings that are added to the queries and keys in each self-attention layer. spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): Spatial shapes of each feature map. @@ -1619,6 +1687,21 @@ def forward( Starting index of each feature map. valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): Ratio of valid area in each feature level. + text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Flattened text features that are passed to the encoder. + text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + text_self_attention_masks (`torch.BoolTensor` of shape `(batch_size, text_seq_len, text_seq_len)`): + Masks to avoid performing attention between padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + text_position_ids (`torch.LongTensor` of shape `(batch_size, num_queries)`): + Position ids for text features. 
output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1634,41 +1717,76 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = inputs_embeds - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + #TODO check if this is necessary according to original implementation + vision_features = nn.functional.dropout(vision_features, p=self.dropout, training=self.training) - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None + encoder_vision_states = () if output_hidden_states else None + encoder_text_states = () if output_hidden_states else None + all_attn_fused_text = () if output_attentions else None + all_attn_fused_vision = () if output_attentions else None + all_attn_enhanced_text = () if output_attentions else None + all_attn_deformable = () if output_attentions else None for i, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - position_embeddings=position_embeddings, - reference_points=reference_points, + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + # INPUTS FOR ENCODER LAYER + # - vision_features: Tensor, + # - vision_position_embedding: Tensor, + # - spatial_shapes: Tensor, + # - level_start_index: Tensor, + # - key_padding_mask: Tensor, + # - reference_points: Tensor, + # - text_features: Optional[Tensor] = None, + # - text_attention_mask: Optional[Tensor] = None, + # - text_position_embedding: Optional[Tensor] = None, + # - text_self_attention_masks: Optional[Tensor] = None, + # - text_position_ids: Optional[Tensor] = None + (vision_features, text_features), attentions = encoder_layer( + vision_features=vision_features, + vision_position_embedding=vision_position_embedding, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - output_attentions=output_attentions, + key_padding_mask=vision_attention_mask, + reference_points=reference_points, + text_features=text_features, + text_attention_mask=text_attention_mask, + text_position_embedding=text_position_embedding, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=text_position_ids ) - hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) + all_attn_fused_vision += (attentions[0],) + all_attn_fused_text += (attentions[1],) + all_attn_enhanced_text += (attentions[2],) + all_attn_deformable += (attentions[3],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + enc_outputs = [ + vision_features, text_features, + all_attn_fused_vision, all_attn_fused_text, + all_attn_enhanced_text, all_attn_deformable + ] + return tuple(v for v in enc_outputs 
if v is not None) + return GroundingDINOEncoderOutput( + last_hidden_state_vision=vision_features, + last_hidden_state_text=text_features, + hidden_states_vision=encoder_vision_states, + hidden_states_text=encoder_text_states, + cross_attentions_vision=all_attn_fused_vision, + cross_attentions_text=all_attn_fused_text, + attentions_vision=all_attn_deformable, + attentions_text=all_attn_enhanced_text ) - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ From 5ddfa38fdf72b55bd793f3451b48274bdec794b0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 4 Sep 2023 13:09:56 -0300 Subject: [PATCH 048/252] Converted encoder to hf --- .../configuration_grounding_dino.py | 2 +- .../convert_grounding_dino_to_hf.py | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3abf4912ebb651..14e82704cb495b 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -157,7 +157,7 @@ def __init__( backbone_config={"model_type": "swin"}, text_backbone_config="bert-base-uncased", num_channels=3, - num_queries=300, + num_queries=900, max_position_embeddings=1024, encoder_layers=6, encoder_ffn_dim=2048, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d5ebc9281b8733..f9fc7e87d12bba 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -131,6 +131,88 @@ def create_rename_keys(state_dict, config): if "module.bert" in layer_name: rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) ########################################## TEXT BACKBONE - END + + ########################################## ENCODER - START + deformable_key_mappings = { + 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', + 'self_attn.sampling_offsets.bias': 'deformable_layer.self_attn.sampling_offsets.bias', + 'self_attn.attention_weights.weight': 'deformable_layer.self_attn.attention_weights.weight', + 'self_attn.attention_weights.bias': 'deformable_layer.self_attn.attention_weights.bias', + 'self_attn.value_proj.weight': 'deformable_layer.self_attn.value_proj.weight', + 'self_attn.value_proj.bias': 'deformable_layer.self_attn.value_proj.bias', + 'self_attn.output_proj.weight': 'deformable_layer.self_attn.output_proj.weight', + 'self_attn.output_proj.bias': 'deformable_layer.self_attn.output_proj.bias', + 'norm1.weight': 'deformable_layer.self_attn_layer_norm.weight', + 'norm1.bias': 'deformable_layer.self_attn_layer_norm.bias', + 'linear1.weight': 'deformable_layer.fc1.weight', + 'linear1.bias': 'deformable_layer.fc1.bias', + 'linear2.weight': 'deformable_layer.fc2.weight', + 'linear2.bias': 'deformable_layer.fc2.bias', + 'norm2.weight': 'deformable_layer.final_layer_norm.weight', + 'norm2.bias': 'deformable_layer.final_layer_norm.bias', + } + text_enhancer_key_mappings = { + 'self_attn.in_proj_weight': 'text_enhancer_layer.self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 
'text_enhancer_layer.self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'text_enhancer_layer.self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'text_enhancer_layer.self_attn.out_proj.bias', + 'linear1.weight': 'text_enhancer_layer.fc1.weight', + 'linear1.bias': 'text_enhancer_layer.fc1.bias', + 'linear2.weight': 'text_enhancer_layer.fc2.weight', + 'linear2.bias': 'text_enhancer_layer.fc2.bias', + 'norm1.weight': 'text_enhancer_layer.layer_norm_before.weight', + 'norm1.bias': 'text_enhancer_layer.layer_norm_before.bias', + 'norm2.weight': 'text_enhancer_layer.layer_norm_after.weight', + 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', + } + fusion_key_mappings = { + 'gamma_v': 'fusion_layer.gamma_v', + 'gamma_l': 'fusion_layer.gamma_l', + 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', + 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', + 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', + 'layer_norm_l.bias': 'fusion_layer.layer_norm_text.bias', + 'attn.v_proj.weight': 'fusion_layer.attn.vision_proj.weight', + 'attn.v_proj.bias': 'fusion_layer.attn.vision_proj.bias', + 'attn.l_proj.weight': 'fusion_layer.attn.text_proj.weight', + 'attn.l_proj.bias': 'fusion_layer.attn.text_proj.bias', + 'attn.values_v_proj.weight': 'fusion_layer.attn.values_vision_proj.weight', + 'attn.values_v_proj.bias': 'fusion_layer.attn.values_vision_proj.bias', + 'attn.values_l_proj.weight': 'fusion_layer.attn.values_text_proj.weight', + 'attn.values_l_proj.bias': 'fusion_layer.attn.values_text_proj.bias', + 'attn.out_v_proj.weight': 'fusion_layer.attn.out_vision_proj.weight', + 'attn.out_v_proj.bias': 'fusion_layer.attn.out_vision_proj.bias', + 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', + 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', + } + + for layer in range(config.encoder_layers): + # deformable + for src, dest in deformable_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # text enhance + for src, dest in text_enhancer_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.text_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # fusion layers + for src, dest in fusion_key_mappings.items(): + rename_keys.append((f"module.transformer.encoder.fusion_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + ########################################## ENCODER - END + + #TODO convert decoder + ########################################## DECODER - START + ########################################## DECODER - END + + #TODO convert head + ########################################## HEAD - START + ########################################## HEAD - END + + #TODO convert additional layers + ########################################## Additional - START + ########################################## Additional - END + # fmt: on return rename_keys @@ -259,6 +341,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) + for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) From 0512f7a286d311617a222b77b841c6835a19b3aa Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 5 Sep 2023 16:10:51 -0300 Subject: [PATCH 049/252] Modified Decoder Layer --- .../grounding_dino/modeling_grounding_dino.py | 51 
++++++++++++++----- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 229c5d89c716f9..9f6edac849f2c9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1388,7 +1388,7 @@ def __init__(self, config: GroundingDINOConfig): self.embed_dim = config.d_model # self-attention - self.self_attn = GroundingDINOMultiheadAttention( + self.self_attn = nn.MultiheadAttention( embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, @@ -1398,6 +1398,13 @@ def __init__(self, config: GroundingDINOConfig): self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention text + self.encoder_attn_text = nn.MultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( config, @@ -1410,6 +1417,9 @@ def __init__(self, config: GroundingDINOConfig): self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) self.final_layer_norm = nn.LayerNorm(self.embed_dim) + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + def forward( self, hidden_states: torch.Tensor, @@ -1417,8 +1427,11 @@ def forward( reference_points=None, spatial_shapes=None, level_start_index=None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, + vision_encoder_hidden_states: Optional[torch.Tensor] = None, + vision_encoder_attention_mask: Optional[torch.Tensor] = None, + text_encoder_hidden_states: Optional[torch.Tensor] = None, + text_encoder_attention_mask: Optional[torch.Tensor] = None, + self_attn_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): """ @@ -1446,9 +1459,10 @@ def forward( # Self Attention hidden_states, self_attn_weights = self.self_attn( - hidden_states=hidden_states, - position_embeddings=position_embeddings, - output_attentions=output_attentions, + query=self.with_pos_embed(hidden_states, position_embeddings), + key=self.with_pos_embed(hidden_states, position_embeddings), + value=hidden_states, + attn_mask=self_attn_mask ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1457,13 +1471,27 @@ def forward( second_residual = hidden_states + # Cross-Attention Text + hidden_states, text_cross_attn_weights = self.encoder_attn_text( + query=self.with_pos_embed(hidden_states, position_embeddings), + key=text_encoder_hidden_states.transpose(0, 1), + value=text_encoder_hidden_states.transpose(0, 1), + attn_mask=text_encoder_attention_mask, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + hidden_states = self.encoder_attn_text_layer_norm(hidden_states) + + third_residual = hidden_states + # Cross-Attention cross_attn_weights = None hidden_states, cross_attn_weights = self.encoder_attn( hidden_states=hidden_states, - attention_mask=encoder_attention_mask, - encoder_hidden_states=encoder_hidden_states, - 
encoder_attention_mask=encoder_attention_mask, + attention_mask=vision_encoder_attention_mask, + encoder_hidden_states=vision_encoder_hidden_states, + encoder_attention_mask=vision_encoder_attention_mask, position_embeddings=position_embeddings, reference_points=reference_points, spatial_shapes=spatial_shapes, @@ -1472,8 +1500,7 @@ def forward( ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = second_residual + hidden_states - + hidden_states = third_residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) # Fully Connected @@ -1488,7 +1515,7 @@ def forward( outputs = (hidden_states,) if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) + outputs += (self_attn_weights, text_cross_attn_weights, cross_attn_weights) return outputs From d2cd35f204b12257250ed7db9f004af93b7dfc7b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 6 Sep 2023 14:33:57 -0300 Subject: [PATCH 050/252] Modified main decoder class --- .../configuration_grounding_dino.py | 6 +-- .../convert_grounding_dino_to_hf.py | 37 ++++++++++++++ .../grounding_dino/modeling_grounding_dino.py | 49 +++++++++++++------ 3 files changed, 73 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 14e82704cb495b..33de7c666cef19 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -163,7 +163,7 @@ def __init__( encoder_ffn_dim=2048, encoder_attention_heads=8, decoder_layers=6, - decoder_ffn_dim=1024, + decoder_ffn_dim=2048, decoder_attention_heads=8, encoder_layerdrop=0.0, is_encoder_decoder=True, @@ -183,9 +183,9 @@ def __init__( num_feature_levels=4, encoder_n_points=4, decoder_n_points=4, - two_stage=False, + two_stage=True, two_stage_num_proposals=300, - with_box_refine=False, + with_box_refine=True, class_cost=1, bbox_cost=5, giou_cost=2, diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index f9fc7e87d12bba..846892980d2d21 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -203,6 +203,43 @@ def create_rename_keys(state_dict, config): #TODO convert decoder ########################################## DECODER - START + key_mappings_decoder = { + 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', + 'cross_attn.sampling_offsets.bias': 'encoder_attn.sampling_offsets.bias', + 'cross_attn.attention_weights.weight': 'encoder_attn.attention_weights.weight', + 'cross_attn.attention_weights.bias': 'encoder_attn.attention_weights.bias', + 'cross_attn.value_proj.weight': 'encoder_attn.value_proj.weight', + 'cross_attn.value_proj.bias': 'encoder_attn.value_proj.bias', + 'cross_attn.output_proj.weight': 'encoder_attn.output_proj.weight', + 'cross_attn.output_proj.bias': 'encoder_attn.output_proj.bias', + 'norm1.weight': 'encoder_attn_layer_norm.weight', + 'norm1.bias': 'encoder_attn_layer_norm.bias', + 'ca_text.in_proj_weight': 'encoder_attn_text.in_proj_weight', + 'ca_text.in_proj_bias': 'encoder_attn_text.in_proj_bias', + 'ca_text.out_proj.weight': 'encoder_attn_text.out_proj.weight', + 'ca_text.out_proj.bias': 'encoder_attn_text.out_proj.bias', + 
'catext_norm.weight': 'encoder_attn_text_layer_norm.weight', + 'catext_norm.bias': 'encoder_attn_text_layer_norm.bias', + 'self_attn.in_proj_weight': 'self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 'self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'self_attn.out_proj.bias', + 'norm2.weight': 'self_attn_layer_norm.weight', + 'norm2.bias': 'self_attn_layer_norm.bias', + 'linear1.weight': 'fc1.weight', + 'linear1.bias': 'fc1.bias', + 'linear2.weight': 'fc2.weight', + 'linear2.bias': 'fc2.bias', + 'norm3.weight': 'final_layer_norm.weight', + 'norm3.bias': 'final_layer_norm.bias', + } + for layer_num in range(config.decoder_layers): + source_prefix_decoder = f'module.transformer.decoder.layers.{layer_num}.' + target_prefix_decoder = f'model.decoder.layers.{layer_num}.' + + for source_name, target_name in key_mappings_decoder.items(): + rename_keys.append((source_prefix_decoder + source_name, + target_prefix_decoder + target_name)) ########################################## DECODER - END #TODO convert head diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9f6edac849f2c9..d57e823199703a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -160,10 +160,14 @@ class GroundingDINODecoderOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + vision_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. + text_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the encoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the text cross-attention heads. 
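+
+    Example (illustrative sketch; assumes `outputs` was produced by a `GroundingDINODecoder` forward
+    pass run with `output_attentions=True`, `return_dict=True` and both vision and text encoder
+    hidden states provided):
+
+        >>> # one tuple entry per decoder layer for each attention stream
+        >>> len(outputs.vision_cross_attentions) == len(outputs.text_cross_attentions) == len(outputs.attentions)
+        True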
""" last_hidden_state: torch.FloatTensor = None @@ -171,7 +175,8 @@ class GroundingDINODecoderOutput(ModelOutput): intermediate_reference_points: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None @dataclass class GroundingDINOEncoderOutput(ModelOutput): @@ -1814,7 +1819,6 @@ def forward( attentions_text=all_attn_enhanced_text ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. @@ -1840,20 +1844,24 @@ def __init__(self, config: GroundingDINOConfig): # hack implementation for iterative bounding box refinement and two-stage Deformable DETR self.bbox_embed = None self.class_embed = None + self.query_scale = None # Initialize weights and apply final processing self.post_init() def forward( self, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, + inputs_embeds, + vision_encoder_hidden_states, + vision_encoder_attention_mask=None, + text_encoder_hidden_states=None, + text_encoder_attention_mask=None, position_embeddings=None, reference_points=None, spatial_shapes=None, level_start_index=None, valid_ratios=None, + self_attn_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -1902,7 +1910,8 @@ def forward( # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None + all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None intermediate = () intermediate_reference_points = () @@ -1930,20 +1939,23 @@ def custom_forward(*inputs): layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, - encoder_hidden_states, - encoder_attention_mask, + vision_encoder_hidden_states, + vision_encoder_attention_mask, None, ) else: layer_outputs = decoder_layer( - hidden_states, + hidden_states=hidden_states, position_embeddings=position_embeddings, - encoder_hidden_states=encoder_hidden_states, reference_points=reference_points_input, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, + vision_encoder_hidden_states=vision_encoder_hidden_states, + vision_encoder_attention_mask=vision_encoder_attention_mask, + text_encoder_hidden_states=text_encoder_hidden_states, + text_encoder_attention_mask=text_encoder_attention_mask, + self_attn_mask=self_attn_mask, + output_attentions=output_attentions ) hidden_states = layer_outputs[0] @@ -1970,8 +1982,12 @@ def custom_forward(*inputs): if output_attentions: all_self_attns += (layer_outputs[1],) - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) + if text_encoder_hidden_states is not None: + all_cross_attns_text += (layer_outputs[2],) + + if 
vision_encoder_hidden_states is not None: + all_cross_attns_vision += (layer_outputs[3],) + # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) @@ -2000,7 +2016,8 @@ def custom_forward(*inputs): intermediate_reference_points=intermediate_reference_points, hidden_states=all_hidden_states, attentions=all_self_attns, - cross_attentions=all_cross_attentions, + vision_cross_attentions=all_cross_attns_vision, + text_cross_attentions=all_cross_attns_text ) From cb2ad7f51fc32eab1274909fed59811289d2e34e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 6 Sep 2023 14:38:56 -0300 Subject: [PATCH 051/252] Removed copy comments --- .../models/grounding_dino/modeling_grounding_dino.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d57e823199703a..8cd584c1fcc71c 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -137,7 +137,6 @@ def backward(context, grad_output): @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->GroundingDINO class GroundingDINODecoderOutput(ModelOutput): """ Base class for outputs of the GroundingDINODecoder. This class adds two attributes to @@ -1153,7 +1152,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) - class GroundingDINOFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() @@ -1386,7 +1384,6 @@ def forward( ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->GroundingDINO class GroundingDINODecoderLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() @@ -2006,7 +2003,8 @@ def custom_forward(*inputs): intermediate_reference_points, all_hidden_states, all_self_attns, - all_cross_attentions, + all_cross_attns_vision, + all_cross_attns_text ] if v is not None ) From eaf958d1d3eb38c411d731cf7646cd30bcf22262 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 10 Sep 2023 23:21:17 -0300 Subject: [PATCH 052/252] Fixed forward from GroundingDINOModel and GroundingDINODecoder --- .../configuration_grounding_dino.py | 14 ++ .../convert_grounding_dino_to_hf.py | 9 + .../grounding_dino/modeling_grounding_dino.py | 190 +++++++++++++----- 3 files changed, 162 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 33de7c666cef19..bc43655df050ee 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -130,6 +130,18 @@ class GroundingDINOConfig(PretrainedConfig): disable_custom_kernels (`bool`, *optional*, defaults to `False`): Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom kernels are not supported by PyTorch ONNX export. + max_text_len (`int`, *optional*, defaults to 256): + The maximum length of the text input. + sub_sentence_present (`bool`, *optional*, defaults to `True`): + Whether to use sub-sentence present in the text input. 
+ text_enhancer_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the text enhancer. + fusion_droppath (`float`, *optional*, defaults to 0.1): + The droppath ratio for the fusion module. + fusion_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the fusion module. + embedding_init_target (`bool`, *optional*, defaults to `True`): + Whether to initialize the target with Embedding weights. Examples: @@ -202,6 +214,7 @@ def __init__( text_enhancer_dropout = 0.0, fusion_droppath = 0.1, fusion_dropout = 0.0, + embedding_init_target = True, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -269,6 +282,7 @@ def __init__( # Fusion self.fusion_droppath = fusion_droppath self.fusion_dropout = fusion_dropout + self.embedding_init_target = embedding_init_target super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 846892980d2d21..efced9cba0d522 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -248,6 +248,15 @@ def create_rename_keys(state_dict, config): #TODO convert additional layers ########################################## Additional - START + for layer_name, params in state_dict.items(): + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE + if "module.input_proj" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.input_proj", "model.input_proj_vision"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE + if "module.feat_map" in layer_name: + rename_keys.append((layer_name, layer_name.replace("module.feat_map", "model.input_proj_text"))) + #### + ########################################## Additional - END # fmt: on diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8cd584c1fcc71c..35ed14fa6859bc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1521,6 +1521,27 @@ def forward( return outputs +class GroundingDINOContrastiveEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.max_text_len = config.max_text_len + + def forward( + self, + vision_hidden_state: torch.FloatTensor, + text_hiddend_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor + ) -> torch.FloatTensor: + + + output = vision_hidden_state @ text_hiddend_state.transpose(-1, -2) + output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_output = torch.full((*output.shape[:-1], self.max_text_len), float("-inf"), device=output.device) + new_output[..., : output.shape[-1]] = output + + return new_output # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead class GroundingDINOClassificationHead(nn.Module): @@ -1836,6 +1857,12 @@ def __init__(self, config: GroundingDINOConfig): self.dropout = config.dropout self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) + self.reference_points_head = GroundingDINOMLPPredictionHead( + config.query_dim // 2 * config.d_model, + config.d_model, + config.d_model, + 2 + ) self.gradient_checkpointing = False # hack implementation for iterative bounding 
box refinement and two-stage Deformable DETR @@ -1846,6 +1873,45 @@ def __init__(self, config: GroundingDINOConfig): # Initialize weights and apply final processing self.post_init() + def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTensor: + """Get the position embedding of the proposals.""" + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries + pos_x = proposals[:, :, 0] * scale + pos_y = proposals[:, :, 1] * scale + # batch_size, num_queries, num_pos_feats + pos_x = pos_x[:, :, None] / dim_t + pos_y = pos_y[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + + if proposals.size(-1) == 2: + # batch_size, num_queries, num_pos_feats * 2 + pos = torch.cat((pos_y, pos_x), dim=2) + elif proposals.size(-1) == 4: + w_embed = proposals[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = proposals[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + # batch_size, num_queries, num_pos_feats + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + # batch_size, num_queries, num_pos_feats * 4 + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError("Unknown proposals shape(-1):{}".format(proposals.size(-1))) + return pos + + + def forward( self, inputs_embeds, @@ -1853,7 +1919,6 @@ def forward( vision_encoder_attention_mask=None, text_encoder_hidden_states=None, text_encoder_attention_mask=None, - position_embeddings=None, reference_points=None, spatial_shapes=None, level_start_index=None, @@ -1875,8 +1940,6 @@ def forward( in `[0, 1]`: - 1 for pixels that are real (i.e. **not masked**), - 0 for pixels that are padding (i.e. **masked**). - position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): - Position embeddings that are added to the queries and keys in each self-attention layer. reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. 
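The `get_proposal_pos_embed` helper added above encodes normalized `(center_x, center_y, width, height)` reference boxes as sine/cosine features, one block of `d_model // 2` features per coordinate, before the `reference_points_head` MLP turns them into query position embeddings. A self-contained sketch of the same recipe, assuming `d_model = 256` to match the default config:

```python
import math
import torch

def proposal_pos_embed(proposals: torch.Tensor, d_model: int = 256) -> torch.Tensor:
    """Sine/cosine embedding of normalized (cx, cy, w, h) boxes -> (batch, queries, 2 * d_model)."""
    num_pos_feats = d_model // 2
    temperature = 10000
    scale = 2 * math.pi

    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)

    embeddings = []
    # One block of num_pos_feats features per coordinate, interleaving sin and cos.
    for coord in range(proposals.shape[-1]):
        pos = proposals[:, :, coord] * scale
        pos = pos[:, :, None] / dim_t
        pos = torch.stack((pos[:, :, 0::2].sin(), pos[:, :, 1::2].cos()), dim=3).flatten(2)
        embeddings.append(pos)
    # The hunk above concatenates in (y, x, w, h) order; swap to keep that parity.
    embeddings[0], embeddings[1] = embeddings[1], embeddings[0]
    return torch.cat(embeddings, dim=2)

boxes = torch.rand(2, 900, 4)  # normalized to [0, 1]
print(proposal_pos_embed(boxes).shape)  # torch.Size([2, 900, 512])
```

The 512-dimensional result matches the `config.query_dim // 2 * config.d_model` input size of the new `reference_points_head`.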
spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): @@ -1921,6 +1984,8 @@ def forward( if reference_points.shape[-1] != 2: raise ValueError("Reference points' last dimension must be of size 2") reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + query_pos = self.get_proposal_pos_embed(reference_points_input[:, :, 0, :]) + query_pos = self.reference_points_head(query_pos) if output_hidden_states: all_hidden_states += (hidden_states,) @@ -1943,7 +2008,7 @@ def custom_forward(*inputs): else: layer_outputs = decoder_layer( hidden_states=hidden_states, - position_embeddings=position_embeddings, + position_embeddings=query_pos, reference_points=reference_points_input, spatial_shapes=spatial_shapes, level_start_index=level_start_index, @@ -2034,8 +2099,6 @@ def __init__(self, config: GroundingDINOConfig): backbone = GroundingDINOConvEncoder(config) position_embeddings = build_position_encoding(config) self.backbone = GroundingDINOConvModel(backbone, position_embeddings) - # Create text backbone - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) # Create input projection layers if config.num_feature_levels > 1: @@ -2057,9 +2120,9 @@ def __init__(self, config: GroundingDINOConfig): ) ) in_channels = config.d_model - self.input_proj = nn.ModuleList(input_proj_list) + self.input_proj_vision = nn.ModuleList(input_proj_list) else: - self.input_proj = nn.ModuleList( + self.input_proj_vision = nn.ModuleList( [ nn.Sequential( nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), @@ -2068,8 +2131,12 @@ def __init__(self, config: GroundingDINOConfig): ] ) - if not config.two_stage: - self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + # Create text backbone + self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) + self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) + + if config.embedding_init_target or not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) self.encoder = GroundingDINOEncoder(config) self.decoder = GroundingDINODecoder(config) @@ -2079,10 +2146,8 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) - self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) - self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) else: - self.reference_points = nn.Linear(config.d_model, 2) + self.reference_points = nn.Embedding(config.num_queries, 4) self.post_init() @@ -2164,6 +2229,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) proposals.append(proposal) _cur += height * width + output_proposals = torch.cat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid @@ -2181,12 +2247,15 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values, - pixel_mask=None, - decoder_attention_mask=None, + pixel_values: Tensor, + input_ids: Tensor, + attention_mask: Tensor, + token_type_ids: Tensor, + text_token_mask: Tensor, + 
text_self_attention_masks: Tensor, + position_ids: Tensor, + pixel_mask: Optional[Tensor]=None, encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -2221,6 +2290,10 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Extract text features from text backbone + text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)["last_hidden_state"] + text_features = self.input_proj_text(text_features) + batch_size, num_channels, height, width = pixel_values.shape device = pixel_values.device @@ -2230,13 +2303,13 @@ def forward( # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) # First, sent pixel_values + pixel_mask through Backbone to obtain the features # which is a list of tuples - features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) sources = [] masks = [] - for level, (source, mask) in enumerate(features): - sources.append(self.input_proj[level](source)) + for level, (source, mask) in enumerate(vision_features): + sources.append(self.input_proj_vision[level](source)) masks.append(mask) if mask is None: raise ValueError("No attention mask was provided") @@ -2246,9 +2319,9 @@ def forward( _len_sources = len(sources) for level in range(_len_sources, self.config.num_feature_levels): if level == _len_sources: - source = self.input_proj[level](features[-1][0]) + source = self.input_proj_vision[level](vision_features[-1][0]) else: - source = self.input_proj[level](sources[-1]) + source = self.input_proj_vision[level](sources[-1]) mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) sources.append(source) @@ -2257,7 +2330,7 @@ def forward( # Create queries query_embeds = None - if not self.config.two_stage: + if self.config.embedding_init_target or self.config.two_stage: query_embeds = self.query_position_embeddings.weight # Prepare encoder inputs (by flattening) @@ -2288,26 +2361,35 @@ def forward( # Also provide spatial_shapes, level_start_index and valid_ratios if encoder_outputs is None: encoder_outputs = self.encoder( - inputs_embeds=source_flatten, - attention_mask=mask_flatten, - position_embeddings=lvl_pos_embed_flatten, + vision_features=source_flatten, + vision_attention_mask=mask_flatten, + vision_position_embedding=lvl_pos_embed_flatten, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, + text_features=text_features, + text_attention_mask=text_token_mask, + text_position_embedding=None, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=return_dict ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + # If the 
user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): + encoder_outputs = GroundingDINOEncoderOutput( + last_hidden_state_vision=encoder_outputs[0], + last_hidden_state_text=encoder_outputs[1], + hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, + attentions_vision=encoder_outputs[4] if len(encoder_outputs) > 4 else None, + attentions_text=encoder_outputs[5] if len(encoder_outputs) > 5 else None, + cross_attentions_vision=encoder_outputs[6] if len(encoder_outputs) > 6 else None, + cross_attentions_text=encoder_outputs[7] if len(encoder_outputs) > 7 else None, ) # Fifth, prepare decoder inputs - batch_size, _, num_channels = encoder_outputs[0].shape enc_outputs_class = None enc_outputs_coord_logits = None if self.config.two_stage: @@ -2318,14 +2400,19 @@ def forward( # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) - enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + enc_outputs_class = self.decoder.class_embed[-1]( + object_query_embedding, + encoder_outputs[1], + text_token_mask + ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals # only keep top scoring `config.two_stage_num_proposals` proposals topk = self.config.two_stage_num_proposals - topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_logits = enc_outputs_class.max(-1)[0] + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] topk_coords_logits = torch.gather( enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) ) @@ -2333,27 +2420,31 @@ def forward( topk_coords_logits = topk_coords_logits.detach() reference_points = topk_coords_logits.sigmoid() init_reference_points = reference_points - pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) - query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + if query_embeds: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + else: + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ).detach() else: - query_embed, target = torch.split(query_embeds, num_channels, dim=1) - query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) - target = target.unsqueeze(0).expand(batch_size, -1, -1) - reference_points = self.reference_points(query_embed).sigmoid() + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() init_reference_points = reference_points decoder_outputs = self.decoder( inputs_embeds=target, - position_embeddings=query_embed, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=mask_flatten, + vision_encoder_hidden_states=encoder_outputs[0], + vision_encoder_attention_mask=mask_flatten, + text_encoder_hidden_states=encoder_outputs[1], + text_encoder_attention_mask=text_token_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, + 
self_attn_mask=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=return_dict ) if not return_dict: @@ -2396,14 +2487,11 @@ def __init__(self, config: GroundingDINOConfig): self.model = GroundingDINOModel(config) # Detection heads on top - self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.class_embed = GroundingDINOContrastiveEmbedding(config) self.bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - prior_prob = 0.01 - bias_value = -math.log((1 - prior_prob) / prior_prob) - self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) From 88d07b3a3c293c8102f8f2ac3c2768985427cbc4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 11 Sep 2023 23:40:10 -0300 Subject: [PATCH 053/252] Added all necessary layers, configurations and forward logic up to GroundingDINOModel --- .../configuration_grounding_dino.py | 19 +++++++ .../grounding_dino/modeling_grounding_dino.py | 52 +++++++++++-------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index bc43655df050ee..e413d43b55cd89 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -142,6 +142,14 @@ class GroundingDINOConfig(PretrainedConfig): The dropout ratio for the fusion module. embedding_init_target (`bool`, *optional*, defaults to `True`): Whether to initialize the target with Embedding weights. + query_dim (`int`, *optional*, defaults to 4): + The dimension of the query vector. + decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): + Whether to share the bbox embedding between the decoder and the two-stage bbox generator. + two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. + two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. 
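The `decoder_bbox_embed_share` and `two_stage_*_share` options described above reduce to one question: do the per-layer heads reuse a single module, or does each layer get an independent deep copy? A minimal sketch of the two wiring patterns, with a toy MLP standing in for the real bbox head:

```python
import copy
import torch.nn as nn

bbox_head = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 4))
num_decoder_layers = 6

# Shared: every decoder layer points at the *same* module, so there is one set of
# weights and every layer's gradient accumulates into it.
shared_heads = nn.ModuleList([bbox_head for _ in range(num_decoder_layers)])

# Not shared: deep copies give each layer its own independent parameters.
independent_heads = nn.ModuleList([copy.deepcopy(bbox_head) for _ in range(num_decoder_layers)])

assert shared_heads[0] is shared_heads[-1]
assert independent_heads[0] is not independent_heads[-1]
```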
Examples: @@ -215,6 +223,10 @@ def __init__( fusion_droppath = 0.1, fusion_dropout = 0.0, embedding_init_target = True, + query_dim = 4, + decoder_bbox_embed_share = True, + two_stage_bbox_embed_share = False, + two_stage_class_embed_share = False, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -282,7 +294,14 @@ def __init__( # Fusion self.fusion_droppath = fusion_droppath self.fusion_dropout = fusion_dropout + # Others self.embedding_init_target = embedding_init_target + self.query_dim = query_dim + self.decoder_bbox_embed_share = decoder_bbox_embed_share + self.two_stage_bbox_embed_share = two_stage_bbox_embed_share + if two_stage_bbox_embed_share and not decoder_bbox_embed_share: + raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") + self.two_stage_class_embed_share = two_stage_class_embed_share super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 35ed14fa6859bc..4c35a8cf4b7814 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1856,6 +1856,7 @@ def __init__(self, config: GroundingDINOConfig): super().__init__(config) self.dropout = config.dropout + self.layer_norm = nn.LayerNorm(config.d_model) self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.reference_points_head = GroundingDINOMLPPredictionHead( config.query_dim // 2 * config.d_model, @@ -2038,7 +2039,7 @@ def custom_forward(*inputs): new_reference_points = new_reference_points.sigmoid() reference_points = new_reference_points.detach() - intermediate += (hidden_states,) + intermediate += (self.layer_norm(hidden_states),) intermediate_reference_points += (reference_points,) if output_attentions: @@ -2146,6 +2147,8 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) + self.encoder_output_bbox_embed = None + self.encoder_output_class_embed = None else: self.reference_points = nn.Embedding(config.num_queries, 4) @@ -2400,13 +2403,13 @@ def forward( # hack implementation for two-stage Deformable DETR # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. 
foreground and background) - enc_outputs_class = self.decoder.class_embed[-1]( + enc_outputs_class = self.encoder_output_class_embed( object_query_embedding, encoder_outputs[1], text_token_mask ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) - delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals # only keep top scoring `config.two_stage_num_proposals` proposals @@ -2487,32 +2490,35 @@ def __init__(self, config: GroundingDINOConfig): self.model = GroundingDINOModel(config) # Detection heads on top - self.class_embed = GroundingDINOContrastiveEmbedding(config) - self.bbox_embed = GroundingDINOMLPPredictionHead( + _class_embed = GroundingDINOContrastiveEmbedding(config) + _bbox_embed = GroundingDINOMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) - nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - # if two-stage, the last class_embed and bbox_embed is for region proposal generation - num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers - if config.with_box_refine: - self.class_embed = _get_clones(self.class_embed, num_pred) - self.bbox_embed = _get_clones(self.bbox_embed, num_pred) - nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) - # hack implementation for iterative bounding box refinement - self.model.decoder.bbox_embed = self.bbox_embed + + if config.decoder_bbox_embed_share: + self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: - nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) - self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) - self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) - self.model.decoder.bbox_embed = None + self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) + self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) + # hack implementation for two-stage + self.model.decoder.bbox_embed = self.bbox_embed + self.model.decoder.class_embed = self.class_embed + if config.two_stage: - # hack implementation for two-stage - self.model.decoder.class_embed = self.class_embed - for box_embed in self.bbox_embed: - nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + if config.two_stage_bbox_embed_share: + self.model.encoder_output_bbox_embed = _bbox_embed + else: + self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) + + #TODO don't believe this is necessary since class_embed has no parameters + if config.two_stage_class_embed_share: + self.model.encoder_output_class_embed = _class_embed + else: + self.model.encoder_output_class_embed = copy.deepcopy(_class_embed) # Initialize weights and apply final processing self.post_init() From f17bd3d6e5d6413613e24ee1777308c130523081 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 12 Sep 2023 00:16:28 -0300 Subject: [PATCH 054/252] Added all layers to convertion --- .../convert_grounding_dino_to_hf.py | 101 ++++++++++-------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py 
b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index efced9cba0d522..4c74404b19b288 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -66,72 +66,66 @@ def create_rename_keys(state_dict, config): #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer - rename_keys.append(("module.backbone.0.patch_embed.proj.weight", + rename_keys.append(("backbone.0.patch_embed.proj.weight", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("module.backbone.0.patch_embed.proj.bias", + rename_keys.append(("backbone.0.patch_embed.proj.bias", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("module.backbone.0.patch_embed.norm.weight", + rename_keys.append(("backbone.0.patch_embed.norm.weight", "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("module.backbone.0.patch_embed.norm.bias", + rename_keys.append(("backbone.0.patch_embed.norm.bias", "model.backbone.conv_encoder.model.embeddings.norm.bias")) for layer, depth in enumerate(config.backbone_config.depths): for block in range(depth): # layernorms - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) # intermidiate - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) # output - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) # downsample if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.reduction.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"module.backbone.0.layers.{layer}.downsample.norm.bias", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) for out_indice in config.backbone_config.out_indices: # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"module.backbone.0.norm{out_indice-1}.weight", + rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"module.backbone.0.norm{out_indice-1}.bias", + rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) ########################################## VISION BACKBONE - END - ########################################## TEXT BACKBONE - START - for layer_name, params in state_dict.items(): - if "module.bert" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.bert", "model.text_backbone"))) - ########################################## TEXT BACKBONE - END - ########################################## ENCODER - START deformable_key_mappings = { 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', @@ -185,23 +179,21 @@ def create_rename_keys(state_dict, config): 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', } - for layer in range(config.encoder_layers): # deformable for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.layers.{layer}.{src}", + 
rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # text enhance for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.text_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # fusion layers for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"module.transformer.encoder.fusion_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) ########################################## ENCODER - END - #TODO convert decoder ########################################## DECODER - START key_mappings_decoder = { 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', @@ -234,7 +226,7 @@ def create_rename_keys(state_dict, config): 'norm3.bias': 'final_layer_norm.bias', } for layer_num in range(config.decoder_layers): - source_prefix_decoder = f'module.transformer.decoder.layers.{layer_num}.' + source_prefix_decoder = f'transformer.decoder.layers.{layer_num}.' target_prefix_decoder = f'model.decoder.layers.{layer_num}.' for source_name, target_name in key_mappings_decoder.items(): @@ -246,17 +238,36 @@ def create_rename_keys(state_dict, config): ########################################## HEAD - START ########################################## HEAD - END - #TODO convert additional layers ########################################## Additional - START for layer_name, params in state_dict.items(): + #### TEXT BACKBONE + if "bert" in layer_name: + rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE - if "module.input_proj" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.input_proj", "model.input_proj_vision"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE - if "module.feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("module.feat_map", "model.input_proj_text"))) - #### - + if "input_proj" in layer_name: + rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE + if "feat_map" in layer_name: + rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) + #### DECODER REFERENCE POINT HEAD + if "transformer.decoder.ref_point_head" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", + "model.decoder.reference_points_head"))) + #### DECODER BBOX EMBED + if "transformer.decoder.bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", + "model.decoder.bbox_embed"))) + if "transformer.enc_output" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) + + if "transformer.enc_out_bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", + "model.encoder_output_bbox_embed"))) + + rename_keys.append(("transformer.level_embed", "model.level_embed")) + rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) + rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) + rename_keys.append(("transformer.tgt_embed.weight", "model.query_position_embeddings.weight")) 
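The mappings collected above are only `(old, new)` name pairs; the conversion script still has to move each tensor inside the loaded state dict. A hedged sketch of that step, using a toy state dict and a helper whose name is illustrative rather than taken from the script:

```python
import torch

def rename_key(state_dict: dict, old: str, new: str) -> None:
    # Move a tensor to its new name; pop so the stale key does not survive conversion.
    state_dict[new] = state_dict.pop(old)

# Toy state dict standing in for the original Grounding DINO checkpoint.
state_dict = {"backbone.0.patch_embed.proj.weight": torch.zeros(96, 3, 4, 4)}
rename_keys = [
    (
        "backbone.0.patch_embed.proj.weight",
        "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight",
    ),
]

for old, new in rename_keys:
    rename_key(state_dict, old, new)

print(list(state_dict))
```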
########################################## Additional - END # fmt: on @@ -274,8 +285,8 @@ def read_in_q_k_v(state_dict, config): hidden_size = embed_dim * 2**layer for block in range(depth): # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"module.backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") + in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] @@ -382,7 +393,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): config = get_grounding_dino_config(model_name) # Load original checkpoint - original_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + original_state_dict = torch.load(checkpoint_path, map_location="cpu") # Rename keys new_state_dict = original_state_dict.copy() @@ -452,7 +463,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) parser.add_argument( "--checkpoint_path", - default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny.pth", + default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", type=str, help="Path to the original PyTorch checkpoint (.pth file).", ) From dcd1990175d41d3574be4fc23629661a3ca5868a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 12 Sep 2023 11:24:24 -0300 Subject: [PATCH 055/252] Fixed outputs for GroundingDINOModel and GroundingDINOForObjectDetection --- .../grounding_dino/modeling_grounding_dino.py | 156 +++++++++++++----- 1 file changed, 113 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 4c35a8cf4b7814..c3d094285dcf0d 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -228,10 +228,9 @@ class GroundingDINOEncoderOutput(ModelOutput): @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO class GroundingDINOModelOutput(ModelOutput): """ - Base class for outputs of the Deformable DETR encoder-decoder model. + Base class for outputs of the Grounding DINO encoder-decoder model. Args: init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): @@ -250,25 +249,47 @@ class GroundingDINOModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. 
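The `read_in_q_k_v` hunk above slices each Swin block's fused `qkv` projection into separate query, key and value weights before loading them into the HF layout. A standalone sketch of that slicing, assuming the usual `[q; k; v]` row ordering and an illustrative hidden size:

```python
import torch

hidden_size = 96  # first stage of a Swin-tiny-like backbone; illustrative only
qkv_weight = torch.randn(3 * hidden_size, hidden_size)
qkv_bias = torch.randn(3 * hidden_size)

# The fused projection stacks query, key and value rows in that order,
# so three equal slices recover the separate projections.
query_w = qkv_weight[:hidden_size]
key_w = qkv_weight[hidden_size : 2 * hidden_size]
value_w = qkv_weight[2 * hidden_size :]
query_b, key_b, value_b = qkv_bias[:hidden_size], qkv_bias[hidden_size : 2 * hidden_size], qkv_bias[2 * hidden_size :]

assert query_w.shape == key_w.shape == value_w.shape == (hidden_size, hidden_size)
```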
- cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the - self-attention heads. - enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each + layer plus the initial embedding outputs. 
+ encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). - enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. 
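`enc_outputs_class` and `enc_outputs_coord_logits` documented above drive the two-stage proposal selection: each encoder pixel is scored by its best text-token logit, the top `two_stage_num_proposals` pixels are kept, and their box logits (after a sigmoid) become the initial reference points. A minimal sketch of that selection with made-up sizes:

```python
import torch

batch_size, num_pixels, max_text_len, num_proposals = 2, 13000, 256, 900

enc_outputs_class = torch.randn(batch_size, num_pixels, max_text_len)  # per-pixel text logits
enc_outputs_coord_logits = torch.randn(batch_size, num_pixels, 4)      # per-pixel box logits

# Score each pixel by its best-matching text token, then keep the top proposals.
topk_logits = enc_outputs_class.max(-1).values                          # (batch, num_pixels)
topk_proposals = torch.topk(topk_logits, num_proposals, dim=1).indices  # (batch, num_proposals)

topk_coords_logits = torch.gather(
    enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
)
reference_points = topk_coords_logits.detach().sigmoid()                # (batch, num_proposals, 4)
print(reference_points.shape)
```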
""" @@ -278,16 +299,21 @@ class GroundingDINOModelOutput(ModelOutput): intermediate_reference_points: torch.FloatTensor = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @dataclass -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->GroundingDINO class GroundingDINOObjectDetectionOutput(ModelOutput): """ Output type of [`GroundingDINOForObjectDetection`]. @@ -320,20 +346,42 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. 
- encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, - 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average - in the self-attention heads. + encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in + the multi-scale deformable attention heads. + encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in + the self-attention heads. + encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. 
+ encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, + used to compute the weighted average in the bi-attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -359,12 +407,18 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): intermediate_reference_points: Optional[torch.FloatTensor] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - enc_outputs_class: Optional = None - enc_outputs_coord_logits: Optional = None + decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None def _get_clones(module, N): @@ -1988,8 +2042,11 @@ def forward( query_pos = self.get_proposal_pos_embed(reference_points_input[:, :, 0, :]) query_pos = self.reference_points_head(query_pos) + # In original implementation they apply layer norm before outputting intermediate hidden states + # Though that's not through between layers so the layers use as input the output of the previous layer + # withtout layer norm if output_hidden_states: - all_hidden_states += (hidden_states,) + all_hidden_states += (self.layer_norm(hidden_states),) if self.gradient_checkpointing and self.training: @@ -2055,6 +2112,7 @@ def custom_forward(*inputs): # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + hidden_states = self.layer_norm(hidden_states) # add hidden states from the last decoder layer if output_hidden_states: @@ -2463,10 +2521,16 @@ def forward( intermediate_reference_points=decoder_outputs.intermediate_reference_points, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - 
encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, + decoder_cross_attentions_vision=decoder_outputs.vision_cross_attentions, + decoder_cross_attentions_text=decoder_outputs.text_cross_attentions, + encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, + encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, + encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, + encoder_hidden_states_text=encoder_outputs.hidden_states_text, + encoder_attentions_vision=encoder_outputs.attentions_vision, + encoder_attentions_text=encoder_outputs.attentions_text, + encoder_cross_attentions_vision=encoder_outputs.cross_attentions_vision, + encoder_cross_attentions_text=encoder_outputs.cross_attentions_text, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, ) @@ -2588,7 +2652,7 @@ def forward( ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # First, sent images through DETR base model to obtain encoder + decoder outputs + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( pixel_values, pixel_mask=pixel_mask, @@ -2688,10 +2752,16 @@ def forward( last_hidden_state=outputs.last_hidden_state, decoder_hidden_states=outputs.decoder_hidden_states, decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, + decoder_cross_attentions_vision=outputs.decoder_cross_attentions_vision, + decoder_cross_attentions_text=outputs.decoder_cross_attentions_text, + encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, + encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, + encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, + encoder_hidden_states_text=outputs.encoder_hidden_states_text, + encoder_attentions_vision=outputs.encoder_attentions_vision, + encoder_attentions_text=outputs.encoder_attentions_text, + encoder_cross_attentions_text=outputs.encoder_cross_attentions_text, + encoder_cross_attentions_vision=outputs.encoder_cross_attentions_vision, intermediate_hidden_states=outputs.intermediate_hidden_states, intermediate_reference_points=outputs.intermediate_reference_points, init_reference_points=outputs.init_reference_points, From 39a161c86a2867ffec0cc0e0bf7cdd0d4229bd7d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 11:58:02 -0300 Subject: [PATCH 056/252] Fixed mask input to encoders and fixed nn.MultiheadAttention batch first and attn output --- .../convert_grounding_dino_to_hf.py | 30 ++++----- .../grounding_dino/modeling_grounding_dino.py | 61 ++++++++++++------- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4c74404b19b288..15793a0df03ae7 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -385,7 +385,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token tokenized_for_encoder["attention_mask"] = text_self_attention_masks tokenized_for_encoder["position_ids"] = position_ids - return 
tokenized_for_encoder + return tokenized_for_encoder, tokenized.attention_mask.bool() @torch.no_grad() def convert_grounding_dino_checkpoint(model_name, checkpoint_path): @@ -418,25 +418,17 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ] ) image_inputs = image_processor(image) - text_inputs = text_processor(text, config) - - pixel_mask = torch.ones( - ((1, image_inputs.shape[1], image_inputs.shape[2])), - dtype=torch.long, - device=image_inputs.device + text_inputs, text_token_mask = text_processor(text, config) + + outputs = model( + pixel_values=image_inputs.unsqueeze(0), + input_ids=text_inputs["input_ids"], + attention_mask=text_inputs["attention_mask"], + token_type_ids=text_inputs["token_type_ids"], + text_token_mask=text_token_mask, + text_self_attention_masks=text_inputs["attention_mask"], + position_ids=text_inputs["position_ids"], ) - # output = model.model.backbone.conv_encoder.model(pixel_values=image_inputs.unsqueeze(0)) - output = model.model.text_backbone(**text_inputs) - print(output.last_hidden_state[:, :, :5]) - - # for feature_map in output.last_hidden_state: - # print(f"{feature_map.shape}") - # print(f"\t {feature_map[:, :5, 0, 0].cpu().numpy()}") - - # outputs = model(**inputs).logits - - # print(outputs.keys()) - # print("Looks ok!") # if pytorch_dump_folder_path is not None: # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index c3d094285dcf0d..2cc715b10cce4f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -970,7 +970,8 @@ def __init__(self, config): self.self_attn = nn.MultiheadAttention( embed_dim=config.d_model, num_heads=config.encoder_attention_heads // 2, - dropout=config.text_enhancer_dropout + dropout=config.text_enhancer_dropout, + batch_first=True, ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) @@ -999,7 +1000,13 @@ def forward( attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) - attention_output, attention_weights = self.self_attn(q, k, value=hidden_states, attn_mask=attention_masks) + attention_output, attention_weights = self.self_attn( + query=q, + key=k, + value=hidden_states, + attn_mask=attention_masks, + average_attn_weights=False + ) hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) @@ -1233,8 +1240,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at (delta_v, vision_attn), (delta_t, text_attn) = self.attn( vision_features, text_features, - attention_mask_vision=attention_mask_vision, - attention_mask_text=attention_mask_text + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text ) vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) text_features = text_features + self.drop_path(self.gamma_l * delta_t) @@ -1448,6 +1455,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, + batch_first=True ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -1459,6 +1467,7 @@ def __init__(self, config: GroundingDINOConfig): 
embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, + batch_first=True ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -1518,7 +1527,8 @@ def forward( query=self.with_pos_embed(hidden_states, position_embeddings), key=self.with_pos_embed(hidden_states, position_embeddings), value=hidden_states, - attn_mask=self_attn_mask + attn_mask=self_attn_mask, + average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1533,6 +1543,7 @@ def forward( key=text_encoder_hidden_states.transpose(0, 1), value=text_encoder_hidden_states.transpose(0, 1), attn_mask=text_encoder_attention_mask, + average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -2423,13 +2434,13 @@ def forward( if encoder_outputs is None: encoder_outputs = self.encoder( vision_features=source_flatten, - vision_attention_mask=mask_flatten, + vision_attention_mask=~mask_flatten, vision_position_embedding=lvl_pos_embed_flatten, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, text_features=text_features, - text_attention_mask=text_token_mask, + text_attention_mask=~text_token_mask, text_position_embedding=None, text_self_attention_masks=text_self_attention_masks, text_position_ids=position_ids, @@ -2599,16 +2610,19 @@ def _set_aux_loss(self, outputs_class, outputs_coord): @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values, - pixel_mask=None, - decoder_attention_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + attention_mask: torch.BoolTensor, + token_type_ids: torch.LongTensor, + text_token_mask: torch.BoolTensor, + text_self_attention_masks: torch.BoolTensor, + position_ids: torch.LongTensor, + pixel_mask: Optional[torch.BoolTensor]=None, + encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]]=None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + return_dict: Optional[bool]=None, ): r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2654,12 +2668,15 @@ def forward( # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( - pixel_values, - pixel_mask=pixel_mask, - decoder_attention_mask=decoder_attention_mask, + pixel_values=pixel_values , + input_ids=input_ids , + attention_mask=attention_mask , + token_type_ids=token_type_ids , + text_token_mask=text_token_mask , + text_self_attention_masks=text_self_attention_masks , + position_ids=position_ids , + pixel_mask=pixel_mask , encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, From 5ec72fb47b27a3101ca5c087b5f2ccf49621da18 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 14:14:57 -0300 Subject: [PATCH 057/252] Fixed forward from GroundingDINOTextEnhancerLayer --- .../grounding_dino/modeling_grounding_dino.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 
deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2cc715b10cce4f..36822d53eaa9ab 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -975,16 +975,14 @@ def __init__(self, config): ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) - self.dropout = nn.Dropout(config.text_enhancer_dropout) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) self.layer_norm_before = nn.LayerNorm(config.d_model) self.layer_norm_after = nn.LayerNorm(config.d_model) - self.dropout1 = nn.Dropout(config.text_enhancer_dropout) - self.dropout2 = nn.Dropout(config.text_enhancer_dropout) self.activation = ACT2FN[config.activation_function] self.num_heads = config.encoder_attention_heads // 2 + self.dropout = config.text_enhancer_dropout def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): return hidden_state if position_embeddings is None else hidden_state + position_embeddings @@ -995,7 +993,7 @@ def forward( attention_masks: Optional[Tensor] = None, position_embeddings: Optional[Tensor] = None, ): # repeat attn mask - if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[1]: + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) @@ -1007,13 +1005,18 @@ def forward( attn_mask=attention_masks, average_attn_weights=False ) + attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) + hidden_states = hidden_states + attention_output + residual = hidden_states - hidden_states = hidden_states + self.dropout1(attention_output) hidden_states = self.layer_norm_before(hidden_states) hidden_states = self.activation(self.fc1(hidden_states)) - attention_output = self.fc2(self.dropout(hidden_states)) - hidden_states = hidden_states + self.dropout2(attention_output) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = hidden_states + residual hidden_states = self.layer_norm_after(hidden_states) + return hidden_states, attention_weights class GroundingDINOBiMultiHeadAttention(nn.Module): @@ -1423,12 +1426,10 @@ def forward( ) (text_features, text_enhanced_attn) = self.text_enhancer_layer( - hidden_states=text_features.transpose(0, 1), + hidden_states=text_features, attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=( - text_position_embedding.transpose(0, 1) if text_position_embedding is not None else None - ), - ).transpose(0, 1) + position_embeddings=(text_position_embedding if text_position_embedding is not None else None) + ) (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, From 086f68a70351c826b01e2c25efc9bd5d8187c44b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 13 Sep 2023 14:31:17 -0300 Subject: [PATCH 058/252] Fixed output bug with GroundingDINODeformableLayer --- .../models/grounding_dino/modeling_grounding_dino.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git 
a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 36822d53eaa9ab..e8e147cb00554a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1329,12 +1329,7 @@ def forward( clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights def get_sine_pos_embed( pos_tensor: torch.Tensor, From f75cda2f12466d3e281eb6e4bf5d24f8f7dd8d8a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 15 Sep 2023 18:57:37 -0300 Subject: [PATCH 059/252] Fixed bugs that prevent GroundingDINOForObjectDetection to run forward method --- .../configuration_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e413d43b55cd89..3a62780362d834 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -204,7 +204,7 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - two_stage_num_proposals=300, + two_stage_num_proposals=900, with_box_refine=True, class_cost=1, bbox_cost=5, diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index e8e147cb00554a..2e9d7d3d0de7f5 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1536,9 +1536,9 @@ def forward( # Cross-Attention Text hidden_states, text_cross_attn_weights = self.encoder_attn_text( query=self.with_pos_embed(hidden_states, position_embeddings), - key=text_encoder_hidden_states.transpose(0, 1), - value=text_encoder_hidden_states.transpose(0, 1), - attn_mask=text_encoder_attention_mask, + key=text_encoder_hidden_states, + value=text_encoder_hidden_states, + key_padding_mask=text_encoder_attention_mask, average_attn_weights=False ) @@ -1590,12 +1590,12 @@ def __init__(self, config): def forward( self, vision_hidden_state: torch.FloatTensor, - text_hiddend_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, text_token_mask: torch.BoolTensor ) -> torch.FloatTensor: - output = vision_hidden_state @ text_hiddend_state.transpose(-1, -2) + output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) # padding to max_text_len @@ -1867,7 +1867,7 @@ def forward( text_position_embedding=text_position_embedding, text_self_attention_masks=text_self_attention_masks, text_position_ids=text_position_ids - ) + ) if output_attentions: @@ -2488,7 +2488,7 @@ def forward( topk_coords_logits = topk_coords_logits.detach() reference_points = topk_coords_logits.sigmoid() init_reference_points = reference_points - if query_embeds: + if query_embeds is not None: target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) else: target = torch.gather( @@ -2679,6 +2679,7 @@ def forward( ) hidden_states = outputs.intermediate_hidden_states if return_dict else 
outputs[2] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[9] init_reference = outputs.init_reference_points if return_dict else outputs[0] inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] @@ -2692,7 +2693,11 @@ def forward( else: reference = inter_references[:, level - 1] reference = inverse_sigmoid(reference) - outputs_class = self.class_embed[level](hidden_states[:, level]) + outputs_class = self.class_embed[level]( + vision_hidden_state=hidden_states[:, level], + text_hidden_state=enc_text_hidden_state, + text_token_mask=text_token_mask + ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: outputs_coord_logits = delta_bbox + reference From 8dbed3d4d7bf24ca28d47040b4d4187848cb6381 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 22:37:58 -0300 Subject: [PATCH 060/252] Fixed attentions to be passed correctly --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2e9d7d3d0de7f5..edbab3773a4fcd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2504,7 +2504,7 @@ def forward( vision_encoder_hidden_states=encoder_outputs[0], vision_encoder_attention_mask=mask_flatten, text_encoder_hidden_states=encoder_outputs[1], - text_encoder_attention_mask=text_token_mask, + text_encoder_attention_mask=~text_token_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, From a2af17210ceaf6d2ff9dcef558aa7748bad1274d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:46:17 -0300 Subject: [PATCH 061/252] Passing temperature arg when creating Sine position embedding --- .../models/grounding_dino/modeling_grounding_dino.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index edbab3773a4fcd..671092a234ee04 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -594,7 +594,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[in return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->GroundingDINO class GroundingDINOSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you @@ -619,8 +618,8 @@ def forward(self, pixel_values, pixel_mask): x_embed = pixel_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 - y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) @@ 
-662,7 +661,7 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding(n_steps, normalize=True) + position_embedding = GroundingDINOSinePositionEmbedding(n_steps, config.positional_embedding_temperature, normalize=True) elif config.position_embedding_type == "learned": position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) else: From 759fc1461d2c8adaa02043a5980de352e4317b9e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:47:09 -0300 Subject: [PATCH 062/252] Removed copy comments --- .../models/grounding_dino/modeling_grounding_dino.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 671092a234ee04..000c3e1f23ff1f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -656,7 +656,6 @@ def forward(self, pixel_values, pixel_mask=None): return pos -# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->GroundingDINO def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": From 51963733ae9ffc9a95def9b8751d0d103a8b457f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:48:03 -0300 Subject: [PATCH 063/252] Added temperature argument for position embedding --- .../models/grounding_dino/configuration_grounding_dino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3a62780362d834..e321782b197810 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -150,7 +150,8 @@ class GroundingDINOConfig(PretrainedConfig): Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. - + positional_embedding_temperature (`float`, *optional*, defaults to 20): + The temperature for Sine Positional Embedding that is used together with vision backbone. 
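The hunks above route `config.positional_embedding_temperature` into `GroundingDINOSinePositionEmbedding` and drop the `-0.5` offset when normalizing the cumulative coordinates. For intuition, a minimal standalone sketch of a temperature-parameterized 2D sine embedding is given below; it is independent of the classes in this patch, and the function name and defaults are illustrative only.

```python
# Minimal sketch (assumptions: channels-first pixel masks, default temperature 20
# as in the configuration entry above). Not the patch's class, just the same idea.
import math

import torch


def sine_position_embedding(pixel_mask, embedding_dim=128, temperature=20.0, scale=2 * math.pi):
    # pixel_mask: (batch, height, width), 1 for valid pixels and 0 for padding
    y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
    x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
    eps = 1e-6
    y_embed = y_embed / (y_embed[:, -1:, :] + eps) * scale
    x_embed = x_embed / (x_embed[:, :, -1:] + eps) * scale

    # geometric frequency ladder whose spread is controlled by the temperature
    dim_t = torch.arange(embedding_dim, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / embedding_dim)

    pos_x = x_embed[:, :, :, None] / dim_t
    pos_y = y_embed[:, :, :, None] / dim_t
    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=4).flatten(3)
    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=4).flatten(3)
    # returns (batch, 2 * embedding_dim, height, width)
    return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
```

A larger temperature stretches the wavelengths of the higher-frequency channels; 20 is simply the default value that the configuration change above introduces.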
Examples: ```python @@ -227,6 +228,7 @@ def __init__( decoder_bbox_embed_share = True, two_stage_bbox_embed_share = False, two_stage_class_embed_share = False, + positional_embedding_temperature = 20, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -302,6 +304,7 @@ def __init__( if two_stage_bbox_embed_share and not decoder_bbox_embed_share: raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") self.two_stage_class_embed_share = two_stage_class_embed_share + self.positional_embedding_temperature = positional_embedding_temperature super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property From 900cff443b26aa3bcf4bada6c4fb4263e5c1f116 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 17 Sep 2023 23:48:36 -0300 Subject: [PATCH 064/252] Fixed typo when converting weigths to GroundingDINO vision backbone --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 15793a0df03ae7..3fe62356b8e7d9 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -84,7 +84,7 @@ def create_rename_keys(state_dict, config): f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", - f"encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention @@ -430,6 +430,8 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): position_ids=text_inputs["position_ids"], ) + print("Finished") + # if pytorch_dump_folder_path is not None: # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") # model.save_pretrained(pytorch_dump_folder_path) From f23a54aef775194ecac707b1cb29c0787760f01a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:31:38 -0300 Subject: [PATCH 065/252] Final modifications on modeling --- .../grounding_dino/modeling_grounding_dino.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 000c3e1f23ff1f..92ccdb41bab011 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1005,9 +1005,9 @@ def forward( ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output - residual = hidden_states - hidden_states = self.layer_norm_before(hidden_states) + + residual = hidden_states hidden_states = self.activation(self.fc1(hidden_states)) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = self.fc2(hidden_states) @@ -1426,7 +1426,7 @@ def forward( (vision_features, vision_deformable_attn) = self.deformable_layer( hidden_states=vision_features, - 
attention_mask=key_padding_mask, + attention_mask=~key_padding_mask, position_embeddings=vision_position_embedding, reference_points=reference_points, spatial_shapes=spatial_shapes, @@ -1517,9 +1517,10 @@ def forward( residual = hidden_states # Self Attention + q = k = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=self.with_pos_embed(hidden_states, position_embeddings), - key=self.with_pos_embed(hidden_states, position_embeddings), + query=q, + key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False @@ -1826,9 +1827,6 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - #TODO check if this is necessary according to original implementation - vision_features = nn.functional.dropout(vision_features, p=self.dropout, training=self.training) - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) encoder_vision_states = () if output_hidden_states else None From 3090b2c3d48edb7510004b579626be5f583f2bb1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:41:35 -0300 Subject: [PATCH 066/252] Removed unnecessary class --- .../grounding_dino/modeling_grounding_dino.py | 119 ------------------ 1 file changed, 119 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 92ccdb41bab011..94090841784322 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -841,125 +841,6 @@ def forward( return output, attention_weights - -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO -class GroundingDINOMultiheadAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. - - Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). - """ - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - bias: bool = True, - ): - super().__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - if self.head_dim * num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {num_heads})." 
- ) - self.scaling = self.head_dim**-0.5 - - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - - def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): - return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): - return tensor if position_embeddings is None else tensor + position_embeddings - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_embeddings: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, target_len, embed_dim = hidden_states.size() - # add position embeddings to the hidden states before projecting to queries and keys - if position_embeddings is not None: - hidden_states_original = hidden_states - hidden_states = self.with_pos_embed(hidden_states, position_embeddings) - - # get queries, keys and values - query_states = self.q_proj(hidden_states) * self.scaling - key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) - value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) - - proj_shape = (batch_size * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - source_len = key_states.size(1) - - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): - raise ValueError( - f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" - f" {attn_weights.size()}" - ) - - # expand attention_mask - if attention_mask is not None: - # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] - attention_mask = _expand_mask(attention_mask, hidden_states.dtype) - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, target_len, source_len): - raise ValueError( - f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" - f" {attention_mask.size()}" - ) - attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask - attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) - attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(batch_size, target_len, embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped - -# Repeting some code to avoid convert nn.MultiheadAttention later #TODO is this an approriate way to name this? class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" From 5c19e7548570e3c9cfffac850830b2b19a1406f3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 20 Sep 2023 02:42:41 -0300 Subject: [PATCH 067/252] Fixed convert structure --- .../convert_grounding_dino_to_hf.py | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 3fe62356b8e7d9..5dcaad277092ca 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -388,7 +388,12 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return tokenized_for_encoder, tokenized.attention_mask.bool() @torch.no_grad() -def convert_grounding_dino_checkpoint(model_name, checkpoint_path): +def convert_grounding_dino_checkpoint( + model_name: str, + checkpoint_path: str, + pytorch_dump_folder_path: str = None, + push_to_hub: bool = False +): #Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) @@ -420,6 +425,7 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): image_inputs = image_processor(image) text_inputs, text_token_mask = text_processor(text, config) + # Running forward outputs = model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], @@ -430,19 +436,17 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): position_ids=text_inputs["position_ids"], ) - print("Finished") + if pytorch_dump_folder_path is not None: + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) - # if pytorch_dump_folder_path is not None: - # print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - # model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) - # print(f"Saving image processor to {pytorch_dump_folder_path}") - # image_processor.save_pretrained(pytorch_dump_folder_path) - - # if push_to_hub: - # print(f"Pushing model and image processor for {model_name} to hub") - # 
model.push_to_hub(f"microsoft/{model_name}") - # image_processor.push_to_hub(f"microsoft/{model_name}") + if push_to_hub: + print(f"Pushing model and image processor for {model_name} to hub") + model.push_to_hub(f"microsoft/{model_name}") + image_processor.push_to_hub(f"microsoft/{model_name}") if __name__ == "__main__": @@ -469,4 +473,9 @@ def convert_grounding_dino_checkpoint(model_name, checkpoint_path): ) args = parser.parse_args() - convert_grounding_dino_checkpoint(args.model_name, args.checkpoint_path) \ No newline at end of file + convert_grounding_dino_checkpoint( + args.model_name, + args.checkpoint_path, + args.pytorch_dump_folder_path, + args.push_to_hub + ) \ No newline at end of file From aec2f682649398617bbec5bfcaeb2ef00356032b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 24 Sep 2023 01:35:07 -0300 Subject: [PATCH 068/252] Added image processing --- .../image_processing_grounding_dino.py | 967 ++++++++++++++++++ 1 file changed, 967 insertions(+) create mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py new file mode 100644 index 00000000000000..1adf8e8e0dcd62 --- /dev/null +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -0,0 +1,967 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Image processor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import BaseImageProcessor, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + make_list_of_images, + to_numpy_array, + valid_coco_detection_annotations, + valid_images, +) +from ...utils import ( + ExplicitEnum, + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + +if is_scipy_available(): + import scipy.special + import scipy.stats + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotionFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION) + + +# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (height <= width and height == size) or (width <= height and width == size): + return height, width + + if width < height: + ow = size + oh = int(size * height / width) + else: + oh = size + ow = int(size * width / height) + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. 
+ """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> List[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. 
+ """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + +def prepare_coco_detection_annotation( + image, + target, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by GroundingDINO. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints[keep] + + return new_target + +# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities +def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + probs = scipy.special.softmax(logits, axis=-1) + labels = probs.argmax(-1, keepdims=True) + scores = np.take_along_axis(probs, labels, axis=-1) + scores, labels = scores.squeeze(-1), labels.squeeze(-1) + return scores, labels + +# Copied from transformers.models.detr.image_processing_detr.resize_annotation +def resize_annotation( + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. 
+ """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +class GroundingDINOImageProcessor(BaseImageProcessor): + r""" + Constructs a Grounding DINO image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in + the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be + overridden by the `do_pad` parameter in the `preprocess` method. 
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_pad: bool = True, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + + @classmethod + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + def prepare_annotation( + self, + image: np.ndarray, + target: Dict, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into Grounding DINO model. + """ + target = prepare_coco_detection_annotation( + image, target, input_data_format=input_data_format + ) + + return target + + def prepare(self, image, target): + logger.warning_once( + "The `prepare` method is deprecated and will be removed in a v4.33. " + "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " + "does not return the image anymore.", + ) + target = self.prepare_annotation(image, target) + return image, target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary containing the size to resize to. 
Can contain the keys `shortest_edge` and `longest_edge` or + `height` and `width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> Dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format. + """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: Tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + return padded_image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: List[np.ndarray], + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + image (`np.ndarray`): + Image to pad. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
+ """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + padded_images = [ + self._pad_image( + image, + pad_size, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image after resizing. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. 
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + max_size = None + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, max_size=max_size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_pad = self.do_pad if do_pad is None else do_pad + + if do_resize is not None and size is None: + raise ValueError("Size and max_size must be specified if do_resize is True.") + + if do_rescale is not None and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize is not None and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + images = make_list_of_images(images) + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if not valid_coco_detection_annotations(annotations): + raise ValueError( + "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts" + "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " + "being a list of annotations in the COCO format." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
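[Editor's note] Putting the defaulting and validation above together, end-to-end usage of the image processor might look like the sketch below; the checkpoint path is a placeholder, not a published model id:

from PIL import Image
from transformers import AutoImageProcessor

image = Image.open("cats.jpg")  # any RGB test image
processor = AutoImageProcessor.from_pretrained("path/to/grounding-dino-checkpoint")  # placeholder
inputs = processor(images=image, return_tensors="pt")
# inputs contains "pixel_values" and, when padding is enabled, "pixel_mask"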
+ input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + if annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + data = self.pad( + images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + data = {"pixel_values": images} + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # POSTPROCESSING METHODS - TODO: add support for other frameworks + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`GroundingDINOForObjectDetection`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
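[Editor's note] The box handling in `post_process` relies on turning normalized `(center_x, center_y, width, height)` predictions into absolute corner coordinates; a minimal torch sketch of that conversion and scaling, written independently of the library helper:

import torch

def center_to_corners(boxes_cxcywh):
    # (center_x, center_y, width, height) -> (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
    cx, cy, w, h = boxes_cxcywh.unbind(-1)
    return torch.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], dim=-1)

boxes = torch.tensor([[0.5, 0.5, 0.2, 0.4]])          # relative [0, 1] coordinates
corners = center_to_corners(boxes)                     # tensor([[0.4, 0.3, 0.6, 0.7]])
height, width = 480, 640
scale = torch.tensor([width, height, width, height])   # (w, h, w, h), matching the code above
absolute = corners * scale                             # tensor([[256., 144., 384., 336.]])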
+ """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100 + ): + """ + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`GroundingDINOForObjectDetection`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + top_k (`int`, *optional*, defaults to 100): + Keep only top k bounding boxes before filtering by thresholding. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
+ """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = out_logits.sigmoid() + prob = prob.view(out_logits.shape[0], -1) + k_value = min(top_k, prob.size(1)) + topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results From b7a79cd1229d379ba546c67eaf086cbd5dfdc7c5 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sun, 24 Sep 2023 01:37:59 -0300 Subject: [PATCH 069/252] make fixup partially completed --- docs/source/en/tasks/object_detection.md | 2 +- src/transformers/__init__.py | 32 +- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 6 +- .../models/auto/feature_extraction_auto.py | 1 - .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 +- .../configuration_grounding_dino.py | 35 +- .../convert_grounding_dino_to_hf.py | 163 +++---- .../grounding_dino/modeling_grounding_dino.py | 405 +++++++++--------- .../processing_grounding_dino.py | 0 .../tokenization_grounding_dino.py | 0 src/transformers/utils/dummy_pt_objects.py | 48 +-- utils/check_repo.py | 1 + 14 files changed, 347 insertions(+), 354 deletions(-) create mode 100644 src/transformers/models/grounding_dino/processing_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/tokenization_grounding_dino.py diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 8ed9da455bf7ba..58ec02e80cadf7 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [Grounding DINO](../model_doc/grounding-dino), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Grounding DINO](../model_doc/grounding-dino), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ff461296c5e76e..309ce05c8345e9 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -275,7 +275,6 @@ "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", 
"DebertaV2Config"], "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"], "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"], "models.deprecated": [], "models.deprecated.bort": [], @@ -359,6 +358,7 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -1592,14 +1592,6 @@ "DeformableDetrPreTrainedModel", ] ) - _import_structure["models.grounding_dino"].extend( - [ - "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", - "GroundingDINOForObjectDetection", - "GroundingDINOModel", - "GroundingDINOPreTrainedModel", - ] - ) _import_structure["models.deit"].extend( [ "DEIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1972,6 +1964,14 @@ "GraphormerPreTrainedModel", ] ) + _import_structure["models.grounding_dino"].extend( + [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDINOForObjectDetection", + "GroundingDINOModel", + "GroundingDINOPreTrainedModel", + ] + ) _import_structure["models.groupvit"].extend( [ "GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4437,7 +4437,6 @@ DecisionTransformerConfig, ) from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.deprecated.mctct import ( MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -4513,6 +4512,7 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, @@ -5593,12 +5593,6 @@ DeformableDetrModel, DeformableDetrPreTrainedModel, ) - from .models.grounding_dino import ( - GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, - GroundingDINOForObjectDetection, - GroundingDINOModel, - GroundingDINOPreTrainedModel, - ) from .models.deit import ( DEIT_PRETRAINED_MODEL_ARCHIVE_LIST, DeiTForImageClassification, @@ -5902,6 +5896,12 @@ GraphormerModel, GraphormerPreTrainedModel, ) + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDINOForObjectDetection, + GroundingDINOModel, + GroundingDINOPreTrainedModel, + ) from .models.groupvit import ( GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST, GroupViTModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index cf718e4453f79d..ec035913f29398 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -61,7 +61,6 @@ deberta_v2, decision_transformer, deformable_detr, - grounding_dino, deit, deprecated, deta, @@ -100,6 +99,7 @@ gptj, gptsan_japanese, graphormer, + grounding_dino, groupvit, herbert, hubert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ca005bbc79df90..0b892f7f642642 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ 
b/src/transformers/models/auto/configuration_auto.py @@ -73,7 +73,6 @@ ("deberta-v2", "DebertaV2Config"), ("decision_transformer", "DecisionTransformerConfig"), ("deformable_detr", "DeformableDetrConfig"), - ("grounding-dino", "GroundingDINOConfig"), ("deit", "DeiTConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), @@ -109,6 +108,7 @@ ("gptj", "GPTJConfig"), ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), + ("grounding-dino", "GroundingDINOConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), @@ -288,7 +288,6 @@ ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deta", "DETA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -322,6 +321,7 @@ ("gptj", "GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gptsan-japanese", "GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("graphormer", "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("grounding-dino", "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("groupvit", "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -494,7 +494,6 @@ ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), ("deformable_detr", "Deformable DETR"), - ("grounding-dino", "Grounding DINO"), ("deit", "DeiT"), ("deplot", "DePlot"), ("deta", "DETA"), @@ -535,6 +534,7 @@ ("gptj", "GPT-J"), ("gptsan-japanese", "GPTSAN-japanese"), ("graphormer", "Graphormer"), + ("grounding-dino", "Grounding DINO"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), ("hubert", "Hubert"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 5bc4db87f7048b..befca6a64b81b7 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -50,7 +50,6 @@ ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("data2vec-vision", "BeitFeatureExtractor"), ("deformable_detr", "DeformableDetrFeatureExtractor"), - ("grounding-dino", "GroundingDINOFeatureExtractor"), ("deit", "DeiTFeatureExtractor"), ("detr", "DetrFeatureExtractor"), ("dinat", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index a791255829287d..6399fe192616af 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -53,7 +53,6 @@ ("cvt", "ConvNextImageProcessor"), ("data2vec-vision", "BeitImageProcessor"), ("deformable_detr", "DeformableDetrImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), ("deit", "DeiTImageProcessor"), ("deta", "DetaImageProcessor"), ("detr", "DetrImageProcessor"), @@ -67,6 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 842af5c5272abc..45669e3ad8b4ac 100755 --- 
a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -71,7 +71,6 @@ ("deberta-v2", "DebertaV2Model"), ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), - ("grounding-dino", "GroundingDINOModel"), ("deit", "DeiTModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), @@ -106,6 +105,7 @@ ("gptj", "GPTJModel"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), + ("grounding-dino", "GroundingDINOModel"), ("groupvit", "GroupViTModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), @@ -630,9 +630,9 @@ # Model for Object Detection mapping ("conditional_detr", "ConditionalDetrForObjectDetection"), ("deformable_detr", "DeformableDetrForObjectDetection"), - ("grounding-dino", "GroundingDINOForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), + ("grounding-dino", "GroundingDINOForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e321782b197810..09b9c41f131964 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -26,11 +26,10 @@ } - class GroundingDINOConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate - a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a + This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a + Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. @@ -147,9 +146,11 @@ class GroundingDINOConfig(PretrainedConfig): decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): Whether to share the bbox embedding between the decoder and the two-stage bbox generator. two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal + generation. two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the class embedding between the two-stage bbox generator and the region proposal generation. + Whether to share the class embedding between the two-stage bbox generator and the region proposal + generation. positional_embedding_temperature (`float`, *optional*, defaults to 20): The temperature for Sine Positional Embedding that is used together with vision backbone. 
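[Editor's note] As a quick illustration of the options documented above, the configuration can be instantiated with non-default text/fusion settings as sketched below (unspecified arguments keep their defaults; the values shown are arbitrary):

from transformers import GroundingDINOConfig

config = GroundingDINOConfig(
    max_text_len=256,                     # cap on the tokenized text length
    text_enhancer_dropout=0.0,            # dropout inside the text enhancer layers
    fusion_dropout=0.0,                   # dropout in the image/text fusion layers
    fusion_droppath=0.1,                  # drop-path rate applied in the fusion layers
    positional_embedding_temperature=20,  # temperature of the sine position embedding
)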
Examples: @@ -217,18 +218,18 @@ def __init__( eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, - #other parameters - max_text_len = 256, - sub_sentence_present = True, - text_enhancer_dropout = 0.0, - fusion_droppath = 0.1, - fusion_dropout = 0.0, - embedding_init_target = True, - query_dim = 4, - decoder_bbox_embed_share = True, - two_stage_bbox_embed_share = False, - two_stage_class_embed_share = False, - positional_embedding_temperature = 20, + # other parameters + max_text_len=256, + sub_sentence_present=True, + text_enhancer_dropout=0.0, + fusion_droppath=0.1, + fusion_dropout=0.0, + embedding_init_target=True, + query_dim=4, + decoder_bbox_embed_share=True, + two_stage_bbox_embed_share=False, + two_stage_class_embed_share=False, + positional_embedding_temperature=20, **kwargs, ): if backbone_config is not None and use_timm_backbone: diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 5dcaad277092ca..4f2f3716329ed4 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -14,7 +14,8 @@ # limitations under the License. """Convert GroundingDINO SimMIM checkpoints from the original repository. -URL: https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" +URL: +https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" import argparse @@ -22,11 +23,9 @@ import torch from PIL import Image from torchvision import transforms as T -import torchvision.transforms.functional as F -from transformers import ( - GroundingDINOConfig, GroundingDINOForObjectDetection, AutoTokenizer -) +from transformers import AutoTokenizer, GroundingDINOConfig, GroundingDINOForObjectDetection + IMAGENET_MEAN = [0.485, 0.456, 0.406] IMAGENET_STD = [0.229, 0.224, 0.225] @@ -66,64 +65,64 @@ def create_rename_keys(state_dict, config): #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer - rename_keys.append(("backbone.0.patch_embed.proj.weight", + rename_keys.append(("backbone.0.patch_embed.proj.weight", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.patch_embed.proj.bias", + rename_keys.append(("backbone.0.patch_embed.proj.bias", "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.patch_embed.norm.weight", + rename_keys.append(("backbone.0.patch_embed.norm.weight", "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.patch_embed.norm.bias", + rename_keys.append(("backbone.0.patch_embed.norm.bias", "model.backbone.conv_encoder.model.embeddings.norm.bias")) for layer, depth in enumerate(config.backbone_config.depths): for block in range(depth): # layernorms - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", + + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) # attention - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", + # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) # intermidiate - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) - + # output - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) - + # downsample if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) - + for out_indice in config.backbone_config.out_indices: # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", + rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", + rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) - + ########################################## VISION BACKBONE - END ########################################## ENCODER - START @@ -182,15 +181,15 @@ def create_rename_keys(state_dict, config): for layer in range(config.encoder_layers): # deformable for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # text enhance for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) # fusion layers for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", + rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", f"model.encoder.layers.{layer}.{dest}")) ########################################## ENCODER - END @@ -230,7 +229,7 @@ def create_rename_keys(state_dict, config): target_prefix_decoder = f'model.decoder.layers.{layer_num}.' 
for source_name, target_name in key_mappings_decoder.items(): - rename_keys.append((source_prefix_decoder + source_name, + rename_keys.append((source_prefix_decoder + source_name, target_prefix_decoder + target_name)) ########################################## DECODER - END @@ -240,7 +239,7 @@ def create_rename_keys(state_dict, config): ########################################## Additional - START for layer_name, params in state_dict.items(): - #### TEXT BACKBONE + #### TEXT BACKBONE if "bert" in layer_name: rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE @@ -251,19 +250,19 @@ def create_rename_keys(state_dict, config): rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) #### DECODER REFERENCE POINT HEAD if "transformer.decoder.ref_point_head" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", "model.decoder.reference_points_head"))) #### DECODER BBOX EMBED if "transformer.decoder.bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", "model.decoder.bbox_embed"))) if "transformer.enc_output" in layer_name: rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) - + if "transformer.enc_out_bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", + rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", "model.encoder_output_bbox_embed"))) - + rename_keys.append(("transformer.level_embed", "model.level_embed")) rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) @@ -273,10 +272,12 @@ def create_rename_keys(state_dict, config): # fmt: on return rename_keys + def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val + # we split up the matrix of each encoder layer into queries, keys and values def read_in_q_k_v(state_dict, config): ########################################## VISION BACKBONE - START @@ -288,14 +289,26 @@ def read_in_q_k_v(state_dict, config): in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight"] = in_proj_weight[: hidden_size, :] - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias"] = in_proj_bias[: hidden_size] - - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight"] = in_proj_weight[-hidden_size :, :] - 
state_dict[f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias"] = in_proj_bias[-hidden_size :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight" + ] = in_proj_weight[:hidden_size, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias" + ] = in_proj_bias[:hidden_size] + + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight" + ] = in_proj_weight[hidden_size : hidden_size * 2, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias" + ] = in_proj_bias[hidden_size : hidden_size * 2] + + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight" + ] = in_proj_weight[-hidden_size:, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias" + ] = in_proj_bias[-hidden_size:] ########################################## VISION BACKBONE - END @@ -305,12 +318,14 @@ def prepare_img(): image = Image.open(requests.get(url, stream=True).raw).convert("RGB") return image + def text_processor(text: str, config): def preprocess_caption(caption: str) -> str: result = caption.lower().strip() if result.endswith("."): return result return result + "." + def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: """Generate attention mask between each pair of special tokens Args: @@ -330,9 +345,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token idxs = torch.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = ( - torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - ) + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) position_ids = torch.zeros((bs, num_token), device=input_ids.device) cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 @@ -352,8 +365,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token previous_col = col cate_to_token_mask_list = [ - torch.stack(cate_to_token_mask_listi, dim=0) - for cate_to_token_mask_listi in cate_to_token_mask_list + torch.stack(cate_to_token_mask_listi, dim=0) for cate_to_token_mask_listi in cate_to_token_mask_list ] # # padding mask @@ -361,23 +373,23 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() return attention_mask, position_ids.to(torch.long) + tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens) - + tokenized, special_tokens + ) + max_text_len = config.max_text_len sub_sentence_present = config.sub_sentence_present if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[ - :, : max_text_len, : max_text_len - ] - position_ids = position_ids[:, : max_text_len] - tokenized["input_ids"] = 
tokenized["input_ids"][:, : max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, : max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : max_text_len] + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] # extract text embeddings if sub_sentence_present: @@ -387,14 +399,12 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return tokenized_for_encoder, tokenized.attention_mask.bool() + @torch.no_grad() def convert_grounding_dino_checkpoint( - model_name: str, - checkpoint_path: str, - pytorch_dump_folder_path: str = None, - push_to_hub: bool = False + model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str = None, push_to_hub: bool = False ): - #Define default GroundingDINO configuation + # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) # Load original checkpoint @@ -403,7 +413,7 @@ def convert_grounding_dino_checkpoint( # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) - + for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) @@ -416,17 +426,13 @@ def convert_grounding_dino_checkpoint( image = prepare_img() text = "a cat" image_processor = T.Compose( - [ - T.Resize(size=800, max_size=1333), - T.ToTensor(), - T.Normalize(IMAGENET_MEAN, IMAGENET_STD) - ] + [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] ) image_inputs = image_processor(image) text_inputs, text_token_mask = text_processor(text, config) # Running forward - outputs = model( + model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"], @@ -474,8 +480,5 @@ def convert_grounding_dino_checkpoint( args = parser.parse_args() convert_grounding_dino_checkpoint( - args.model_name, - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.push_to_hub - ) \ No newline at end of file + args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub + ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 94090841784322..69264d51b5e6b0 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -40,13 +40,11 @@ requires_backends, ) from ...modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPoolingAndCrossAttentions, - BaseModelOutputWithPastAndCrossAttentions + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...pytorch_utils import meshgrid +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone from .configuration_grounding_dino import GroundingDINOConfig @@ -135,7 +133,6 @@ def 
backward(context, grad_output): ] - @dataclass class GroundingDINODecoderOutput(ModelOutput): """ @@ -177,11 +174,11 @@ class GroundingDINODecoderOutput(ModelOutput): vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + @dataclass class GroundingDINOEncoderOutput(ModelOutput): """ - Base class for outputs of the GroundingDINOEncoder. This class extends - BaseModelOutput, due to: + Base class for outputs of the GroundingDINOEncoder. This class extends BaseModelOutput, due to: - vision and text last hidden states - vision and text intermediate hidden states - vision and text attentions @@ -193,30 +190,31 @@ class GroundingDINOEncoderOutput(ModelOutput): last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the text encoder. hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer - plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer - plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. 
+ Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. """ + last_hidden_state_vision: torch.FloatTensor = None last_hidden_state_text: torch.FloatTensor = None hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None @@ -262,29 +260,29 @@ class GroundingDINOModelOutput(ModelOutput): encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. 
@@ -359,29 +357,29 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. encoder_hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each - layer plus the initial embedding outputs. + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, used to compute the weighted average in - the multi-scale deformable attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, + used to compute the weighted average in the multi-scale deformable attention heads. encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder, after the attention softmax, used to compute the weighted average in - the self-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, + used to compute the weighted average in the self-attention heads. encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. 
Attentions weights of the vision encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, - used to compute the weighted average in the bi-attention heads. + Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + attention softmax, used to compute the weighted average in the bi-attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -618,8 +616,8 @@ def forward(self, pixel_values, pixel_mask): x_embed = pixel_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) @@ -660,7 +658,9 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding(n_steps, config.positional_embedding_temperature, normalize=True) + position_embedding = GroundingDINOSinePositionEmbedding( + n_steps, config.positional_embedding_temperature, normalize=True + ) elif config.position_embedding_type == "learned": position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) else: @@ -841,17 +841,19 @@ def forward( return output, attention_weights -#TODO is this an approriate way to name this? + +# TODO is this an approriate way to name this? 
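The hunks above touch `GroundingDINOSinePositionEmbedding.forward` and `build_position_encoding`; as a reading aid, here is a minimal standalone sketch of the DETR-style sine position embedding that forward pass computes. It is not code from the patch: the function name, the `embedding_dim=64` / `temperature=10000` values, and the toy mask are illustrative assumptions.

```python
import math

import torch


def sine_position_embedding(pixel_mask: torch.Tensor, embedding_dim: int = 64, temperature: int = 10000) -> torch.Tensor:
    # pixel_mask: (batch_size, height, width), 1 for valid pixels and 0 for padding
    scale = 2 * math.pi
    y_embed = pixel_mask.cumsum(1, dtype=torch.float32)  # running row index per column
    x_embed = pixel_mask.cumsum(2, dtype=torch.float32)  # running column index per row
    eps = 1e-6
    # normalize by the last (largest) cumulative value, as in the fix above
    y_embed = y_embed / (y_embed[:, -1:, :] + eps) * scale
    x_embed = x_embed / (x_embed[:, :, -1:] + eps) * scale

    dim_t = torch.arange(embedding_dim, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / embedding_dim)

    pos_x = x_embed[:, :, :, None] / dim_t
    pos_y = y_embed[:, :, :, None] / dim_t
    # interleave sine on even channels and cosine on odd channels
    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=4).flatten(3)
    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=4).flatten(3)
    return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)  # (batch, 2 * embedding_dim, height, width)


mask = torch.ones(1, 32, 32, dtype=torch.long)
print(sine_position_embedding(mask).shape)  # torch.Size([1, 128, 32, 32])
```

Each spatial location thus gets `2 * embedding_dim` channels: sine/cosine features of its normalized row coordinate concatenated with those of its column coordinate.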
class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" + def __init__(self, config): super().__init__() self.self_attn = nn.MultiheadAttention( - embed_dim=config.d_model, - num_heads=config.encoder_attention_heads // 2, + embed_dim=config.d_model, + num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout, batch_first=True, - ) + ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) @@ -871,18 +873,14 @@ def forward( hidden_states: Tensor, attention_masks: Optional[Tensor] = None, position_embeddings: Optional[Tensor] = None, - ): # repeat attn mask + ): # repeat attn mask if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) q = k = self.with_pos_embed(hidden_states, position_embeddings) attention_output, attention_weights = self.self_attn( - query=q, - key=k, - value=hidden_states, - attn_mask=attention_masks, - average_attn_weights=False + query=q, key=k, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output @@ -897,16 +895,10 @@ def forward( hidden_states = self.layer_norm_after(hidden_states) return hidden_states, attention_weights - + + class GroundingDINOBiMultiHeadAttention(nn.Module): - def __init__( - self, - vision_dim: int, - text_dim: int, - embed_dim: int, - num_heads: int, - dropout:float = 0.1 - ): + def __init__(self, vision_dim: int, text_dim: int, embed_dim: int, num_heads: int, dropout: float = 0.1): super().__init__() self.embed_dim = embed_dim @@ -949,12 +941,12 @@ def _reset_parameters(self): self.out_text_proj.bias.data.fill_(0) def forward( - self, - vision_features: Tensor, - text_features: Tensor, - vision_attention_mask: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None - ): + self, + vision_features: Tensor, + text_features: Tensor, + vision_attention_mask: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + ): """_summary_ Args: @@ -1000,21 +992,21 @@ def forward( attn_weights = attn_weights - attn_weights.max() attn_weights = torch.clamp( - attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range attn_weights = torch.clamp( - attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range attn_weights_T = attn_weights.transpose(1, 2) text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] - + text_attn_weights = torch.clamp( - text_attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + text_attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range text_attn_weights = torch.clamp( - text_attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + text_attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range # mask vison for language if vision_attention_mask is not None: @@ -1027,9 +1019,7 @@ def forward( # mask language 
for vision if text_attention_mask is not None: - text_attention_mask = ( - text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) - ) + text_attention_mask = text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) attn_weights.masked_fill_(text_attention_mask, float("-inf")) vision_attn_weights = attn_weights.softmax(dim=-1) @@ -1062,6 +1052,7 @@ def forward( return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) + # Copied from transformers.models.beit.modeling_beit.drop_path def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: """ @@ -1082,6 +1073,7 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals output = input.div(keep_prob) * random_tensor return output + # Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO class GroundingDINODropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" @@ -1095,6 +1087,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def extra_repr(self) -> str: return "p={}".format(self.drop_prob) + + class GroundingDINOFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() @@ -1104,11 +1098,11 @@ def __init__(self, config, init_values=1e-4): self.layer_norm_vision = nn.LayerNorm(config.d_model) self.layer_norm_text = nn.LayerNorm(config.d_model) self.attn = GroundingDINOBiMultiHeadAttention( - vision_dim=config.d_model, - text_dim=config.d_model, - embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.encoder_attention_heads // 2, - dropout=config.fusion_dropout + vision_dim=config.d_model, + text_dim=config.d_model, + embed_dim=config.encoder_ffn_dim // 2, + num_heads=config.encoder_attention_heads // 2, + dropout=config.fusion_dropout, ) # add layer scale for training stability @@ -1120,17 +1114,18 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) (delta_v, vision_attn), (delta_t, text_attn) = self.attn( - vision_features, - text_features, - vision_attention_mask=attention_mask_vision, - text_attention_mask=attention_mask_text + vision_features, + text_features, + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text, ) vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) text_features = text_features + self.drop_path(self.gamma_l * delta_t) return (vision_features, vision_attn), (text_features, text_attn) -#NOTE just renamed the class + +# NOTE just renamed the class class GroundingDINODeformableLayer(nn.Module): def __init__(self, config: GroundingDINOConfig): super().__init__() @@ -1210,12 +1205,13 @@ def forward( return hidden_states, attn_weights + def get_sine_pos_embed( pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000, exchange_xy: bool = True, - ) -> Tensor: +) -> Tensor: """generate sine position embedding from a position tensor Args: pos_tensor (torch.Tensor): shape: [..., n]. 
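Only the reformatted signature and the first docstring line of `get_sine_pos_embed` appear in this hunk, so the snippet below is a rough sketch of the idea rather than the patch's implementation: a position tensor of shape `[..., n]` is expanded into `num_pos_feats` sine/cosine features per coordinate, which is how the encoder layer later builds embeddings for the text positions it creates with `torch.arange`. The helper name here is an invention for illustration, and the `exchange_xy` behavior is deliberately omitted.

```python
import math

import torch


def sine_pos_embed_sketch(pos: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000) -> torch.Tensor:
    scale = 2 * math.pi
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)

    def embed(x: torch.Tensor) -> torch.Tensor:
        # x: [..., 1] -> [..., num_pos_feats], interleaving sin/cos over the frequency bands
        x = x * scale / dim_t
        return torch.stack((x[..., 0::2].sin(), x[..., 1::2].cos()), dim=-1).flatten(-2)

    # one block of features per coordinate in the last dimension, concatenated together
    return torch.cat([embed(pos[..., i, None]) for i in range(pos.shape[-1])], dim=-1)


# text positions as built in the encoder layer: (batch_size, num_text_tokens, 1)
text_positions = torch.arange(6).float().unsqueeze(0).unsqueeze(-1)
print(sine_pos_embed_sketch(text_positions, num_pos_feats=256).shape)  # torch.Size([1, 6, 256])
```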
@@ -1250,26 +1246,19 @@ def __init__(self, config) -> None: self.deformable_layer = GroundingDINODeformableLayer(config) def get_text_position_embeddings( - self, - text_features: Tensor, - text_position_embedding: Tensor, - text_position_ids: Tensor - ) -> Tensor: + self, text_features: Tensor, text_position_embedding: Tensor, text_position_ids: Tensor + ) -> Tensor: bs, n_text, text_dim = text_features.shape if text_position_embedding is None and text_position_ids is None: text_position_embedding = ( - torch.arange(n_text, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(bs, 1, 1) + torch.arange(n_text, device=text_features.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs, 1, 1) ) text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) if text_position_ids is not None: text_position_embedding = get_sine_pos_embed( text_position_ids[..., None], num_pos_feats=256, exchange_xy=False ) - + return text_position_embedding def forward( @@ -1284,12 +1273,10 @@ def forward( text_attention_mask: Optional[Tensor] = None, text_position_embedding: Optional[Tensor] = None, text_self_attention_masks: Optional[Tensor] = None, - text_position_ids: Optional[Tensor] = None + text_position_ids: Optional[Tensor] = None, ): text_position_embedding = self.get_text_position_embeddings( - text_features, - text_position_embedding, - text_position_ids + text_features, text_position_embedding, text_position_ids ) (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( @@ -1302,7 +1289,7 @@ def forward( (text_features, text_enhanced_attn) = self.text_enhancer_layer( hidden_states=text_features, attention_masks=~text_self_attention_masks, # note we use ~ for mask here - position_embeddings=(text_position_embedding if text_position_embedding is not None else None) + position_embeddings=(text_position_embedding if text_position_embedding is not None else None), ) (vision_features, vision_deformable_attn) = self.deformable_layer( @@ -1315,8 +1302,8 @@ def forward( ) return ( - (vision_features, text_features), - (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn) + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn), ) @@ -1330,7 +1317,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True + batch_first=True, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -1342,7 +1329,7 @@ def __init__(self, config: GroundingDINOConfig): embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True + batch_first=True, ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -1400,11 +1387,7 @@ def forward( # Self Attention q = k = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=q, - key=k, - value=hidden_states, - attn_mask=self_attn_mask, - average_attn_weights=False + query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1419,7 +1402,7 @@ def forward( key=text_encoder_hidden_states, value=text_encoder_hidden_states, key_padding_mask=text_encoder_attention_mask, - 
average_attn_weights=False + average_attn_weights=False, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -1462,19 +1445,18 @@ def forward( return outputs + class GroundingDINOContrastiveEmbedding(nn.Module): def __init__(self, config): super().__init__() self.max_text_len = config.max_text_len def forward( - self, - vision_hidden_state: torch.FloatTensor, - text_hidden_state: torch.FloatTensor, - text_token_mask: torch.BoolTensor - ) -> torch.FloatTensor: - - + self, + vision_hidden_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor, + ) -> torch.FloatTensor: output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) output.masked_fill_(~text_token_mask[:, None, :], float("-inf")) @@ -1484,6 +1466,7 @@ def forward( return new_output + # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead class GroundingDINOClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" @@ -1503,30 +1486,29 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->GroundingDINO class GroundingDINOPreTrainedModel(PreTrainedModel): config_class = GroundingDINOConfig base_model_prefix = "model" main_input_name = "pixel_values" def _init_weights(self, module): - std = self.config.init_std - if isinstance(module, GroundingDINOLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): module._reset_parameters() - elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() + elif isinstance(module, GroundingDINOBiMultiHeadAttention): + module._reset_parameters() + elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + elif isinstance(module, GroundingDINOModel): + nn.init.constant_(module.input_proj_text.bias.data, 0) + nn.init.xavier_uniform_(module.input_proj_text.weight.data) + for proj in module.input_proj_vision: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) if hasattr(module, "reference_points") and not self.config.two_stage: nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) nn.init.constant_(module.reference_points.bias.data, 0.0) @@ -1743,9 +1725,8 @@ def forward( text_attention_mask=text_attention_mask, text_position_embedding=text_position_embedding, text_self_attention_masks=text_self_attention_masks, - text_position_ids=text_position_ids - ) - + text_position_ids=text_position_ids, + ) if output_attentions: all_attn_fused_vision += (attentions[0],) @@ -1759,9 +1740,12 @@ def forward( if not return_dict: enc_outputs = [ - vision_features, text_features, - all_attn_fused_vision, all_attn_fused_text, - all_attn_enhanced_text, all_attn_deformable + vision_features, + text_features, + 
all_attn_fused_vision, + all_attn_fused_text, + all_attn_enhanced_text, + all_attn_deformable, ] return tuple(v for v in enc_outputs if v is not None) return GroundingDINOEncoderOutput( @@ -1772,9 +1756,10 @@ def forward( cross_attentions_vision=all_attn_fused_vision, cross_attentions_text=all_attn_fused_text, attentions_vision=all_attn_deformable, - attentions_text=all_attn_enhanced_text + attentions_text=all_attn_enhanced_text, ) + class GroundingDINODecoder(GroundingDINOPreTrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. @@ -1797,10 +1782,7 @@ def __init__(self, config: GroundingDINOConfig): self.layer_norm = nn.LayerNorm(config.d_model) self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) self.reference_points_head = GroundingDINOMLPPredictionHead( - config.query_dim // 2 * config.d_model, - config.d_model, - config.d_model, - 2 + config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2 ) self.gradient_checkpointing = False @@ -1826,7 +1808,7 @@ def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTen # batch_size, num_queries, num_pos_feats pos_x = pos_x[:, :, None] / dim_t pos_y = pos_y[:, :, None] / dim_t - # batch_size, num_queries, num_pos_feats + # batch_size, num_queries, num_pos_feats pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) @@ -1849,8 +1831,6 @@ def get_proposal_pos_embed(self, proposals: torch.FloatTensor) -> torch.FloatTen raise ValueError("Unknown proposals shape(-1):{}".format(proposals.size(-1))) return pos - - def forward( self, inputs_embeds, @@ -1959,7 +1939,7 @@ def custom_forward(*inputs): text_encoder_hidden_states=text_encoder_hidden_states, text_encoder_attention_mask=text_encoder_attention_mask, self_attn_mask=self_attn_mask, - output_attentions=output_attentions + output_attentions=output_attentions, ) hidden_states = layer_outputs[0] @@ -1992,7 +1972,6 @@ def custom_forward(*inputs): if vision_encoder_hidden_states is not None: all_cross_attns_vision += (layer_outputs[3],) - # Keep batch_size as first dimension intermediate = torch.stack(intermediate, dim=1) intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) @@ -2012,7 +1991,7 @@ def custom_forward(*inputs): all_hidden_states, all_self_attns, all_cross_attns_vision, - all_cross_attns_text + all_cross_attns_text, ] if v is not None ) @@ -2023,7 +2002,7 @@ def custom_forward(*inputs): hidden_states=all_hidden_states, attentions=all_self_attns, vision_cross_attentions=all_cross_attns_vision, - text_cross_attentions=all_cross_attns_text + text_cross_attentions=all_cross_attns_text, ) @@ -2075,7 +2054,7 @@ def __init__(self, config: GroundingDINOConfig): ) # Create text backbone - self.text_backbone = GroundingDINOTextModel(config.text_backbone_config) + self.text_backbone = GroundingDINOTextPrenet(config.text_backbone_config) self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: @@ -2199,7 +2178,7 @@ def forward( text_token_mask: Tensor, text_self_attention_masks: Tensor, position_ids: Tensor, - pixel_mask: Optional[Tensor]=None, + pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, output_hidden_states=None, @@ -2236,7 +2215,9 @@ def forward( return_dict 
= return_dict if return_dict is not None else self.config.use_return_dict # Extract text features from text backbone - text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)["last_hidden_state"] + text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)[ + "last_hidden_state" + ] text_features = self.input_proj_text(text_features) batch_size, num_channels, height, width = pixel_values.shape @@ -2319,7 +2300,7 @@ def forward( text_position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict + return_dict=return_dict, ) # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): @@ -2346,9 +2327,7 @@ def forward( # apply a detection head to each pixel (A.4 in paper) # linear projection for bounding box binary classification (i.e. foreground and background) enc_outputs_class = self.encoder_output_class_embed( - object_query_embedding, - encoder_outputs[1], - text_token_mask + object_query_embedding, encoder_outputs[1], text_token_mask ) # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) @@ -2389,7 +2368,7 @@ def forward( self_attn_mask=None, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict + return_dict=return_dict, ) if not return_dict: @@ -2422,8 +2401,8 @@ def forward( @add_start_docstrings( """ - Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on - top, for tasks such as COCO detection. + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, + for tasks such as COCO detection. 
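Several of the pieces above (the `GroundingDINOContrastiveEmbedding` class and the `encoder_output_class_embed` call in the two-stage branch) score object queries directly against the encoded text rather than against a fixed label set. A toy sketch of that mechanism, with made-up shapes and random tensors purely for illustration:

```python
import torch

batch_size, num_queries, hidden_size = 2, 4, 8
num_text_tokens, max_text_len = 5, 10

vision_hidden_state = torch.randn(batch_size, num_queries, hidden_size)    # decoder/query features
text_hidden_state = torch.randn(batch_size, num_text_tokens, hidden_size)  # encoded text tokens
text_token_mask = torch.tensor([[1, 1, 1, 1, 0],
                                [1, 1, 1, 0, 0]], dtype=torch.bool)         # real vs. padded text tokens

# similarity of every query to every text token
logits = vision_hidden_state @ text_hidden_state.transpose(-1, -2)          # (batch, queries, text_tokens)
logits = logits.masked_fill(~text_token_mask[:, None, :], float("-inf"))    # ignore padded text positions

# pad to a fixed width so every decoder layer produces logits of the same shape
padded_logits = torch.full((batch_size, num_queries, max_text_len), float("-inf"))
padded_logits[..., :num_text_tokens] = logits
print(padded_logits.shape)  # torch.Size([2, 4, 10])
```

Per-query scores are then derived from these text-token logits, which is what keeps the detection head open-vocabulary.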
""", GROUNDING_DINO_START_DOCSTRING, ) @@ -2446,13 +2425,12 @@ def __init__(self, config: GroundingDINOConfig): nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - if config.decoder_bbox_embed_share: self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) - # hack implementation for two-stage + # hack implementation for two-stage self.model.decoder.bbox_embed = self.bbox_embed self.model.decoder.class_embed = self.class_embed @@ -2461,8 +2439,8 @@ def __init__(self, config: GroundingDINOConfig): self.model.encoder_output_bbox_embed = _bbox_embed else: self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) - - #TODO don't believe this is necessary since class_embed has no parameters + + # TODO don't believe this is necessary since class_embed has no parameters if config.two_stage_class_embed_share: self.model.encoder_output_class_embed = _class_embed else: @@ -2490,12 +2468,12 @@ def forward( text_token_mask: torch.BoolTensor, text_self_attention_masks: torch.BoolTensor, position_ids: torch.LongTensor, - pixel_mask: Optional[torch.BoolTensor]=None, - encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]]=None, - labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, + pixel_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ): r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2541,14 +2519,14 @@ def forward( # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( - pixel_values=pixel_values , - input_ids=input_ids , - attention_mask=attention_mask , - token_type_ids=token_type_ids , - text_token_mask=text_token_mask , - text_self_attention_masks=text_self_attention_masks , - position_ids=position_ids , - pixel_mask=pixel_mask , + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + position_ids=position_ids, + pixel_mask=pixel_mask, encoder_outputs=encoder_outputs, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -2573,8 +2551,8 @@ def forward( outputs_class = self.class_embed[level]( vision_hidden_state=hidden_states[:, level], text_hidden_state=enc_text_hidden_state, - text_token_mask=text_token_mask - ) + text_token_mask=text_token_mask, + ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: outputs_coord_logits = delta_bbox + reference @@ -3117,6 +3095,7 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): raise ValueError("Only 3-dimensional tensors are supported") return NestedTensor(tensor, mask) + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText class GroundingDINOTextEmbeddings(nn.Module): """Construct the embeddings from word, position and 
token_type embeddings.""" @@ -3181,8 +3160,10 @@ def forward( embeddings = self.dropout(embeddings) return embeddings + # Classes for Text Backbone (It's just a BERT model) + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3317,6 +3298,7 @@ def forward( outputs = outputs + (past_key_value,) return outputs + # Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText class GroundingDINOTextSelfOutput(nn.Module): def __init__(self, config): @@ -3331,6 +3313,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText class GroundingDINOTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): @@ -3380,6 +3363,7 @@ def forward( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + # Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText class GroundingDINOTextIntermediate(nn.Module): def __init__(self, config): @@ -3395,6 +3379,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText class GroundingDINOTextOutput(nn.Module): def __init__(self, config): @@ -3409,6 +3394,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + # Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText class GroundingDINOTextLayer(nn.Module): def __init__(self, config): @@ -3495,6 +3481,7 @@ def feed_forward_chunk(self, attention_output): layer_output = self.output(intermediate_output, attention_output) return layer_output + # Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText class GroundingDINOTextEncoder(nn.Module): def __init__(self, config): @@ -3593,6 +3580,7 @@ def custom_forward(*inputs): cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText class GroundingDINOTextPooler(nn.Module): def __init__(self, config): @@ -3608,7 +3596,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: pooled_output = self.activation(pooled_output) return pooled_output -class GroundingDINOTextModel(PreTrainedModel): + +class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/transformers/models/grounding_dino/tokenization_grounding_dino.py b/src/transformers/models/grounding_dino/tokenization_grounding_dino.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 22f24222f67514..21ce436a8c4935 100644 --- 
a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2486,30 +2486,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None - - -class GroundingDINOForObjectDetection(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class GroundingDINOModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class GroundingDINOPreTrainedModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -4005,6 +3981,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GroundingDINOForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDINOPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/utils/check_repo.py b/utils/check_repo.py index 85cf36eeacb1b7..95ab142fa0b7f9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -223,6 +223,7 @@ "FlavaMultimodalModel", "GPT2DoubleHeadsModel", "GPTSw3DoubleHeadsModel", + "GroundingDINOTextPrenet", "InstructBlipVisionModel", "InstructBlipQFormerModel", "LayoutLMForQuestionAnswering", From 685f1d66b3656087515b97185efab8017f39d71c Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 13:45:13 -0300 Subject: [PATCH 070/252] Now text_backbone_config has its own class --- .../configuration_grounding_dino.py | 119 ++++++++++++++++-- 1 file changed, 111 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 09b9c41f131964..a3aa2b733d0474 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -25,6 +25,115 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } +# Copied from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet +class GroundingDINOTextPrenetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a + [`TFGroundingDINOTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. 
Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`] or [`TFGroundingDINOTextPrenetModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`] or + [`TFGroundingDINOTextPrenetModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python + >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOTextPrenetModel + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = GroundingDINOTextPrenetConfig() + + >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration + >>> model = GroundingDINOTextPrenetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "grounding-dino-text-prenet" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + class GroundingDINOConfig(PretrainedConfig): r""" @@ -177,7 +286,7 @@ def __init__( self, use_timm_backbone=False, backbone_config={"model_type": "swin"}, - text_backbone_config="bert-base-uncased", + text_backbone_config=None, num_channels=3, num_queries=900, max_position_embeddings=1024, @@ -187,15 +296,12 @@ def __init__( decoder_layers=6, decoder_ffn_dim=2048, decoder_attention_heads=8, - encoder_layerdrop=0.0, is_encoder_decoder=True, activation_function="relu", d_model=256, dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", @@ -259,9 +365,6 @@ def __init__( self.attention_dropout = attention_dropout self.activation_dropout = activation_dropout self.activation_function = activation_function - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.encoder_layerdrop = encoder_layerdrop self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type self.backbone = backbone @@ -289,7 +392,7 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = AutoConfig.from_pretrained(text_backbone_config) + self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config self.max_text_len = max_text_len self.sub_sentence_present = sub_sentence_present # Text Enhancer From d6e88fcf7d9f8c9cd009c85efc62e028b92e96fb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 13:47:56 -0300 Subject: [PATCH 071/252] Modified convert script --- .../convert_grounding_dino_to_hf.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git 
a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4f2f3716329ed4..29ad93f70ab536 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -374,7 +374,7 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token return attention_mask, position_ids.to(torch.long) - tokenizer = AutoTokenizer.from_pretrained(config.text_backbone_config._name_or_path) + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") @@ -401,12 +401,21 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token @torch.no_grad() -def convert_grounding_dino_checkpoint( - model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str = None, push_to_hub: bool = False -): +def convert_grounding_dino_checkpoint(args): + + model_name = args.model_name + pytorch_dump_folder_path = args.pytorch_dump_folder_path + push_to_hub = args.push_to_hub + + checkpoint_mapping = { + "grounding-dino-tiny": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", + "grounding-dino-base": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + } # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) + checkpoint_path = checkpoint_mapping[model_name] + # Load original checkpoint original_state_dict = torch.load(checkpoint_path, map_location="cpu") @@ -432,7 +441,7 @@ def convert_grounding_dino_checkpoint( text_inputs, text_token_mask = text_processor(text, config) # Running forward - model( + output = model( pixel_values=image_inputs.unsqueeze(0), input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"], @@ -451,8 +460,11 @@ def convert_grounding_dino_checkpoint( if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") - model.push_to_hub(f"microsoft/{model_name}") - image_processor.push_to_hub(f"microsoft/{model_name}") + model.push_to_hub(f"EduardoPacheco/{model_name}") + #TODO push image processor to hub + # image_processor.push_to_hub(f"microsoft/{model_name}") + #TODO push tokenizer to hub + #TODO push processor to hub if __name__ == "__main__": @@ -460,17 +472,17 @@ def convert_grounding_dino_checkpoint( # Required parameters parser.add_argument( "--model_name", - default="grounding-dino-tiny", + default="grounding-dino-base", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", ) - parser.add_argument( - "--checkpoint_path", - default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", - type=str, - help="Path to the original PyTorch checkpoint (.pth file).", - ) + # parser.add_argument( + # "--checkpoint_path", + # default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + # type=str, + # help="Path to the original PyTorch checkpoint (.pth file).", + # ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
) @@ -479,6 +491,4 @@ def convert_grounding_dino_checkpoint( ) args = parser.parse_args() - convert_grounding_dino_checkpoint( - args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub - ) + convert_grounding_dino_checkpoint(args) From 0242e57c848e684e5b4408ce8e20a02439241a0b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 6 Oct 2023 15:01:44 -0300 Subject: [PATCH 072/252] Removed unnecessary config attribute --- .../configuration_grounding_dino.py | 2 -- .../convert_grounding_dino_to_hf.py | 21 ++++--------------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index a3aa2b733d0474..fbd0d483b48e45 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -326,7 +326,6 @@ def __init__( disable_custom_kernels=False, # other parameters max_text_len=256, - sub_sentence_present=True, text_enhancer_dropout=0.0, fusion_droppath=0.1, fusion_dropout=0.0, @@ -394,7 +393,6 @@ def __init__( # Text backbone self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config self.max_text_len = max_text_len - self.sub_sentence_present = sub_sentence_present # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout # Fusion diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 29ad93f70ab536..ed16da3f0c4617 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -347,7 +347,6 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token # generate attention mask and positional ids attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) position_ids = torch.zeros((bs, num_token), device=input_ids.device) - cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 for i in range(idxs.shape[0]): row, col = idxs[i] @@ -359,18 +358,8 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token position_ids[row, previous_col + 1 : col + 1] = torch.arange( 0, col - previous_col, device=input_ids.device ) - c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() - c2t_maski[previous_col + 1 : col] = True - cate_to_token_mask_list[row].append(c2t_maski) - previous_col = col - - cate_to_token_mask_list = [ - torch.stack(cate_to_token_mask_listi, dim=0) for cate_to_token_mask_listi in cate_to_token_mask_list - ] - # # padding mask - # padding_mask = tokenized['attention_mask'] - # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + previous_col = col return attention_mask, position_ids.to(torch.long) @@ -383,7 +372,6 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token ) max_text_len = config.max_text_len - sub_sentence_present = config.sub_sentence_present if text_self_attention_masks.shape[1] > max_text_len: text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] position_ids = position_ids[:, :max_text_len] @@ -392,10 +380,9 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token 
tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] # extract text embeddings - if sub_sentence_present: - tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} - tokenized_for_encoder["attention_mask"] = text_self_attention_masks - tokenized_for_encoder["position_ids"] = position_ids + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids return tokenized_for_encoder, tokenized.attention_mask.bool() From af06c85c5e471c60b705e7e2e48522f9763a67d9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:06:12 -0300 Subject: [PATCH 073/252] Added new function to generate sub sentence mask --- .../grounding_dino/modeling_grounding_dino.py | 76 +++++++++++++++---- 1 file changed, 61 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 69264d51b5e6b0..d75db4735ad30a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -47,7 +47,7 @@ from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone -from .configuration_grounding_dino import GroundingDINOConfig +from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextPrenetConfig from .load_custom import load_cuda_kernels @@ -1923,9 +1923,16 @@ def custom_forward(*inputs): layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, + query_pos, + reference_points_input, + spatial_shapes, + level_start_index, vision_encoder_hidden_states, vision_encoder_attention_mask, - None, + text_encoder_hidden_states, + text_encoder_attention_mask, + self_attn_mask, + None ) else: layer_outputs = decoder_layer( @@ -2005,6 +2012,42 @@ def custom_forward(*inputs): text_cross_attentions=all_cross_attns_text, ) +SPECIAL_TOKENS = [101, 102, 1012, 1029] +def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: + """Generate attention mask between each pair of special tokens and positional ids. + Args: + input_ids (torch.LongTensor): input ids. Shape: [bs, num_token] + Returns: + Tuple[torch.Tensor]: attention mask between each special tokens and position_ids + """ + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in SPECIAL_TOKENS: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + + previous_col = col + + return attention_mask, position_ids.to(torch.long) + @add_start_docstrings( """ @@ -2173,11 +2216,8 @@ def forward( self, pixel_values: Tensor, input_ids: Tensor, - attention_mask: Tensor, token_type_ids: Tensor, - text_token_mask: Tensor, - text_self_attention_masks: Tensor, - position_ids: Tensor, + attention_mask: Tensor, pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, @@ -2214,8 +2254,19 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + + max_text_len = self.config.max_text_len + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + input_ids = input_ids[:, :max_text_len] + token_type_ids = token_type_ids[:, :max_text_len] + text_token_mask = text_token_mask[:, :max_text_len] + # Extract text features from text backbone - text_features = self.text_backbone(input_ids, attention_mask, token_type_ids, position_ids)[ + text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ "last_hidden_state" ] text_features = self.input_proj_text(text_features) @@ -2463,11 +2514,8 @@ def forward( self, pixel_values: torch.FloatTensor, input_ids: torch.LongTensor, - attention_mask: torch.BoolTensor, + attention_mask: torch.LongTensor, token_type_ids: torch.LongTensor, - text_token_mask: torch.BoolTensor, - text_self_attention_masks: torch.BoolTensor, - position_ids: torch.LongTensor, pixel_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, @@ -2523,9 +2571,6 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, - text_token_mask=text_token_mask, - text_self_attention_masks=text_self_attention_masks, - position_ids=position_ids, pixel_mask=pixel_mask, encoder_outputs=encoder_outputs, output_attentions=output_attentions, @@ -2551,7 +2596,7 @@ def forward( outputs_class = self.class_embed[level]( vision_hidden_state=hidden_states[:, level], text_hidden_state=enc_text_hidden_state, - text_token_mask=text_token_mask, + text_token_mask=attention_mask.bool(), ) delta_bbox = self.bbox_embed[level](hidden_states[:, level]) if reference.shape[-1] == 4: 
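To make the newly added `generate_masks_with_special_tokens_and_transfer_map` concrete, here is a small hand-traced usage sketch. It assumes the helper can be imported from the modeling module; the ids 101/102/1012 are the `[CLS]`/`[SEP]`/`"."` entries of `SPECIAL_TOKENS`, while the remaining ids merely stand in for ordinary word-piece tokens of a prompt like `"a cat. a dog."` and are not real tokenizer output.

```python
import torch

from transformers.models.grounding_dino.modeling_grounding_dino import (
    generate_masks_with_special_tokens_and_transfer_map,
)

# [CLS]  a     cat   .     a     dog   .     [SEP]
input_ids = torch.tensor([[101, 1037, 4937, 1012, 1037, 3899, 1012, 102]])

attention_mask, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids)

print(position_ids)
# tensor([[0, 0, 1, 2, 0, 1, 2, 0]]) -- position ids restart after every special token
print(attention_mask[0].int())
# block-diagonal: the tokens of "a cat ." attend only among themselves, likewise "a dog .",
# while [CLS] and [SEP] attend only to themselves
```

This is what lets each sub-sentence (phrase) in the prompt be encoded independently by the text backbone while sharing a single forward pass.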
@@ -3609,6 +3654,7 @@ class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ + config_class = GroundingDINOTextPrenetConfig def __init__(self, config, add_pooling_layer=True): super().__init__(config) From 43c0ce572c21c3a14c78f7170cd211e4a880e493 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:26:14 -0300 Subject: [PATCH 074/252] Renamed parameters with gamma in the name as it's currently not allowed --- .../models/grounding_dino/modeling_grounding_dino.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d75db4735ad30a..71e7cb33fba0b9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1107,8 +1107,8 @@ def __init__(self, config, init_values=1e-4): # add layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.gamma_v = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) - self.gamma_l = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.vision_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.text_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): vision_features = self.layer_norm_vision(vision_features) @@ -1119,8 +1119,8 @@ def forward(self, vision_features, text_features, attention_mask_vision=None, at vision_attention_mask=attention_mask_vision, text_attention_mask=attention_mask_text, ) - vision_features = vision_features + self.drop_path(self.gamma_v * delta_v) - text_features = text_features + self.drop_path(self.gamma_l * delta_t) + vision_features = vision_features + self.drop_path(self.vision_param * delta_v) + text_features = text_features + self.drop_path(self.text_param * delta_t) return (vision_features, vision_attn), (text_features, text_attn) From 2bb7b70eaffcf4520013859a77fae3418985d18f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:27:04 -0300 Subject: [PATCH 075/252] Removed tokenization and image_processing scripts since we'll map from existing models --- .../image_processing_grounding_dino.py | 967 ------------------ .../tokenization_grounding_dino.py | 0 2 files changed, 967 deletions(-) delete mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py delete mode 100644 src/transformers/models/grounding_dino/tokenization_grounding_dino.py diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py deleted file mode 100644 index 1adf8e8e0dcd62..00000000000000 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ /dev/null @@ -1,967 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Image processor class for Deformable DETR.""" - -import io -import pathlib -from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union - -import numpy as np - -from ...feature_extraction_utils import BatchFeature -from ...image_processing_utils import BaseImageProcessor, get_size_dict -from ...image_transforms import ( - PaddingMode, - center_to_corners_format, - corners_to_center_format, - id_to_rgb, - pad, - rescale, - resize, - rgb_to_id, - to_channel_dimension_format, -) -from ...image_utils import ( - IMAGENET_DEFAULT_MEAN, - IMAGENET_DEFAULT_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_coco_detection_annotations, - valid_images, -) -from ...utils import ( - ExplicitEnum, - TensorType, - is_flax_available, - is_jax_tensor, - is_scipy_available, - is_tf_available, - is_tf_tensor, - is_torch_available, - is_torch_tensor, - is_vision_available, - logging, -) - - -if is_torch_available(): - import torch - from torch import nn - - -if is_vision_available(): - import PIL - -if is_scipy_available(): - import scipy.special - import scipy.stats - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotionFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - - -SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION) - - -# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio -def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: - """ - Computes the output image size given the input image size and the desired output size. - - Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): - The desired output size. - max_size (`int`, *optional*): - The maximum allowed output size. - """ - height, width = image_size - if max_size is not None: - min_original_size = float(min((height, width))) - max_original_size = float(max((height, width))) - if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) - - if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: - ow = size - oh = int(size * height / width) - else: - oh = size - ow = int(size * width / height) - return (oh, ow) - - -# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size -def get_resize_output_image_size( - input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], - max_size: Optional[int] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, -) -> Tuple[int, int]: - """ - Computes the output image size given the input image size and the desired output size. If the desired output size - is a tuple or list, the output image size is returned as is. 
If the desired output size is an integer, the output - image size is computed by keeping the aspect ratio of the input image size. - - Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): - The desired output size. - max_size (`int`, *optional*): - The maximum allowed output size. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred from the input image. - """ - image_size = get_image_size(input_image, input_data_format) - if isinstance(size, (list, tuple)): - return size - - return get_size_with_aspect_ratio(image_size, size, max_size) - - -# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn -def get_numpy_to_framework_fn(arr) -> Callable: - """ - Returns a function that converts a numpy array to the framework of the input array. - - Args: - arr (`np.ndarray`): The array to convert. - """ - if isinstance(arr, np.ndarray): - return np.array - if is_tf_available() and is_tf_tensor(arr): - import tensorflow as tf - - return tf.convert_to_tensor - if is_torch_available() and is_torch_tensor(arr): - import torch - - return torch.tensor - if is_flax_available() and is_jax_tensor(arr): - import jax.numpy as jnp - - return jnp.array - raise ValueError(f"Cannot convert arrays of type {type(arr)}") - - -# Copied from transformers.models.detr.image_processing_detr.safe_squeeze -def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: - """ - Squeezes an array, but only if the axis specified has dim 1. - """ - if axis is None: - return arr.squeeze() - - try: - return arr.squeeze(axis=axis) - except ValueError: - return arr - - -# Copied from transformers.models.detr.image_processing_detr.normalize_annotation -def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: - image_height, image_width = image_size - norm_annotation = {} - for key, value in annotation.items(): - if key == "boxes": - boxes = value - boxes = corners_to_center_format(boxes) - boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) - norm_annotation[key] = boxes - else: - norm_annotation[key] = value - return norm_annotation - - -# Copied from transformers.models.detr.image_processing_detr.max_across_indices -def max_across_indices(values: Iterable[Any]) -> List[Any]: - """ - Return the maximum value across all indices of an iterable of values. - """ - return [max(values_i) for values_i in zip(*values)] - - -# Copied from transformers.models.detr.image_processing_detr.get_max_height_width -def get_max_height_width( - images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None -) -> List[int]: - """ - Get the maximum height and width across all images in a batch. 
- """ - if input_data_format is None: - input_data_format = infer_channel_dimension_format(images[0]) - - if input_data_format == ChannelDimension.FIRST: - _, max_height, max_width = max_across_indices([img.shape for img in images]) - elif input_data_format == ChannelDimension.LAST: - max_height, max_width, _ = max_across_indices([img.shape for img in images]) - else: - raise ValueError(f"Invalid channel dimension format: {input_data_format}") - return (max_height, max_width) - - -# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask -def make_pixel_mask( - image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None -) -> np.ndarray: - """ - Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. - - Args: - image (`np.ndarray`): - Image to make the pixel mask for. - output_size (`Tuple[int, int]`): - Output size of the mask. - """ - input_height, input_width = get_image_size(image, channel_dim=input_data_format) - mask = np.zeros(output_size, dtype=np.int64) - mask[:input_height, :input_width] = 1 - return mask - -def prepare_coco_detection_annotation( - image, - target, - input_data_format: Optional[Union[ChannelDimension, str]] = None, -): - """ - Convert the target in COCO format into the format expected by GroundingDINO. - """ - image_height, image_width = get_image_size(image, channel_dim=input_data_format) - - image_id = target["image_id"] - image_id = np.asarray([image_id], dtype=np.int64) - - # Get all COCO annotations for the given image. - annotations = target["annotations"] - annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] - - classes = [obj["category_id"] for obj in annotations] - classes = np.asarray(classes, dtype=np.int64) - - # for conversion to coco api - area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) - iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) - - boxes = [obj["bbox"] for obj in annotations] - # guard against no boxes via resizing - boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) - boxes[:, 2:] += boxes[:, :2] - boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) - boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) - - keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) - - new_target = {} - new_target["image_id"] = image_id - new_target["class_labels"] = classes[keep] - new_target["boxes"] = boxes[keep] - new_target["area"] = area[keep] - new_target["iscrowd"] = iscrowd[keep] - new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) - - if annotations and "keypoints" in annotations[0]: - keypoints = [obj["keypoints"] for obj in annotations] - keypoints = np.asarray(keypoints, dtype=np.float32) - num_keypoints = keypoints.shape[0] - keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints - new_target["keypoints"] = keypoints[keep] - - return new_target - -# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities -def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - probs = scipy.special.softmax(logits, axis=-1) - labels = probs.argmax(-1, keepdims=True) - scores = np.take_along_axis(probs, labels, axis=-1) - scores, labels = scores.squeeze(-1), labels.squeeze(-1) - return scores, labels - -# Copied from 
transformers.models.detr.image_processing_detr.resize_annotation -def resize_annotation( - annotation: Dict[str, Any], - orig_size: Tuple[int, int], - target_size: Tuple[int, int], - threshold: float = 0.5, - resample: PILImageResampling = PILImageResampling.NEAREST, -): - """ - Resizes an annotation to a target size. - - Args: - annotation (`Dict[str, Any]`): - The annotation dictionary. - orig_size (`Tuple[int, int]`): - The original size of the input image. - target_size (`Tuple[int, int]`): - The target size of the image, as returned by the preprocessing `resize` step. - threshold (`float`, *optional*, defaults to 0.5): - The threshold used to binarize the segmentation masks. - resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): - The resampling filter to use when resizing the masks. - """ - ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) - ratio_height, ratio_width = ratios - - new_annotation = {} - new_annotation["size"] = target_size - - for key, value in annotation.items(): - if key == "boxes": - boxes = value - scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) - new_annotation["boxes"] = scaled_boxes - elif key == "area": - area = value - scaled_area = area * (ratio_width * ratio_height) - new_annotation["area"] = scaled_area - elif key == "masks": - masks = value[:, None] - masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) - masks = masks.astype(np.float32) - masks = masks[:, 0] > threshold - new_annotation["masks"] = masks - elif key == "size": - new_annotation["size"] = target_size - else: - new_annotation[key] = value - - return new_annotation - - -class GroundingDINOImageProcessor(BaseImageProcessor): - r""" - Constructs a Grounding DINO image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be - overridden by the `do_resize` parameter in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Resampling filter to use if resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the - `do_rescale` parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - do_normalize: - Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the - `preprocess` method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): - Mean values to use when normalizing the image. Can be a single value or a list of values, one for each - channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): - Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one - for each channel. 
Can be overridden by the `image_std` parameter in the `preprocess` method. - do_pad (`bool`, *optional*, defaults to `True`): - Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be - overridden by the `do_pad` parameter in the `preprocess` method. - """ - - model_input_names = ["pixel_values", "pixel_mask"] - - def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BILINEAR, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Union[float, List[float]] = None, - image_std: Union[float, List[float]] = None, - do_pad: bool = True, - **kwargs, - ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} - size = get_size_dict(size, max_size=max_size, default_to_square=False) - - super().__init__(**kwargs) - self.format = format - self.do_resize = do_resize - self.size = size - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad - - @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO - def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - - def prepare_annotation( - self, - image: np.ndarray, - target: Dict, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> Dict: - """ - Prepare an annotation for feeding into Grounding DINO model. - """ - target = prepare_coco_detection_annotation( - image, target, input_data_format=input_data_format - ) - - return target - - def prepare(self, image, target): - logger.warning_once( - "The `prepare` method is deprecated and will be removed in a v4.33. " - "Please use `prepare_annotation` instead. 
Note: the `prepare_annotation` method " - "does not return the image anymore.", - ) - target = self.prepare_annotation(image, target) - return image, target - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize - def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BILINEAR, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> np.ndarray: - """ - Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an - int, smaller edge of the image will be matched to this number. - - Args: - image (`np.ndarray`): - Image to resize. - size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Resampling filter to use if resizing the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) - if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( - image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format - ) - elif "height" in size and "width" in size: - size = (size["height"], size["width"]) - else: - raise ValueError( - "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" - f" {size.keys()}." - ) - image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs - ) - return image - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation - def resize_annotation( - self, - annotation, - orig_size, - size, - resample: PILImageResampling = PILImageResampling.NEAREST, - ) -> Dict: - """ - Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched - to this number. - """ - return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale - def rescale( - self, - image: np.ndarray, - rescale_factor: float, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """ - Rescale the image by the given factor. image = image * rescale_factor. - - Args: - image (`np.ndarray`): - Image to rescale. - rescale_factor (`float`): - The value to use for rescaling. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. 
Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the input image. If unset, is inferred from the input image. Can be - one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - """ - return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation - def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: - """ - Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to - `[center_x, center_y, width, height]` format. - """ - return normalize_annotation(annotation, image_size=image_size) - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image - def _pad_image( - self, - image: np.ndarray, - output_size: Tuple[int, int], - constant_values: Union[float, Iterable[float]] = 0, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """ - Pad an image with zeros to the given size. - """ - input_height, input_width = get_image_size(image, channel_dim=input_data_format) - output_height, output_width = output_size - - pad_bottom = output_height - input_height - pad_right = output_width - input_width - padding = ((0, pad_bottom), (0, pad_right)) - padded_image = pad( - image, - padding, - mode=PaddingMode.CONSTANT, - constant_values=constant_values, - data_format=data_format, - input_data_format=input_data_format, - ) - return padded_image - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad - def pad( - self, - images: List[np.ndarray], - constant_values: Union[float, Iterable[float]] = 0, - return_pixel_mask: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> BatchFeature: - """ - Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width - in the batch and optionally returns their corresponding pixel mask. - - Args: - image (`np.ndarray`): - Image to pad. - constant_values (`float` or `Iterable[float]`, *optional*): - The value to use for the padding if `mode` is `"constant"`. - return_pixel_mask (`bool`, *optional*, defaults to `True`): - Whether to return a pixel mask. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. 
- input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - pad_size = get_max_height_width(images, input_data_format=input_data_format) - - padded_images = [ - self._pad_image( - image, - pad_size, - constant_values=constant_values, - data_format=data_format, - input_data_format=input_data_format, - ) - for image in images - ] - data = {"pixel_values": padded_images} - - if return_pixel_mask: - masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) - for image in images - ] - data["pixel_mask"] = masks - - return BatchFeature(data=data, tensor_type=return_tensors) - - def preprocess( - self, - images: ImageInput, - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - resample=None, # PILImageResampling - do_rescale: Optional[bool] = None, - rescale_factor: Optional[Union[int, float]] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_pad: Optional[bool] = None, - return_tensors: Optional[Union[TensorType, str]] = None, - data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> BatchFeature: - """ - Preprocess an image or a batch of images so that it can be used by the model. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. - annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - do_resize (`bool`, *optional*, defaults to self.do_resize): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. - resample (`PILImageResampling`, *optional*, defaults to self.resample): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to self.do_rescale): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to self.rescale_factor): - Rescale factor to use when rescaling the image. - do_normalize (`bool`, *optional*, defaults to self.do_normalize): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): - Mean to use when normalizing the image. - image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): - Standard deviation to use when normalizing the image. - do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. 
- return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): - Type of tensors to return. If `None`, will return the list of images. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - if "pad_and_return_pixel_mask" in kwargs: - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - max_size = None - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - size = kwargs.pop("max_size") - - do_resize = self.do_resize if do_resize is None else do_resize - size = self.size if size is None else size - size = get_size_dict(size=size, max_size=max_size, default_to_square=False) - resample = self.resample if resample is None else resample - do_rescale = self.do_rescale if do_rescale is None else do_rescale - rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor - do_normalize = self.do_normalize if do_normalize is None else do_normalize - image_mean = self.image_mean if image_mean is None else image_mean - image_std = self.image_std if image_std is None else image_std - do_pad = self.do_pad if do_pad is None else do_pad - - if do_resize is not None and size is None: - raise ValueError("Size and max_size must be specified if do_resize is True.") - - if do_rescale is not None and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") - - if do_normalize is not None and (image_mean is None or image_std is None): - raise ValueError("Image mean and std must be specified if do_normalize is True.") - - images = make_list_of_images(images) - if annotations is not None and isinstance(annotations, dict): - annotations = [annotations] - - if annotations is not None and len(images) != len(annotations): - raise ValueError( - f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." - ) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - if not valid_coco_detection_annotations(annotations): - raise ValueError( - "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts" - "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " - "being a list of annotations in the COCO format." 
- ) - - # All transformations expect numpy arrays - images = [to_numpy_array(image) for image in images] - - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) - if annotations is not None: - prepared_images = [] - prepared_annotations = [] - for image, target in zip(images, annotations): - target = self.prepare_annotation( - image, - target, - input_data_format=input_data_format, - ) - prepared_images.append(image) - prepared_annotations.append(target) - images = prepared_images - annotations = prepared_annotations - del prepared_images, prepared_annotations - - # transformations - if do_resize: - if annotations is not None: - resized_images, resized_annotations = [], [] - for image, target in zip(images, annotations): - orig_size = get_image_size(image, input_data_format) - resized_image = self.resize( - image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format - ) - resized_annotation = self.resize_annotation( - target, orig_size, get_image_size(resized_image, input_data_format) - ) - resized_images.append(resized_image) - resized_annotations.append(resized_annotation) - images = resized_images - annotations = resized_annotations - del resized_images, resized_annotations - else: - images = [ - self.resize(image, size=size, resample=resample, input_data_format=input_data_format) - for image in images - ] - - if do_rescale: - images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] - - if do_normalize: - images = [ - self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images - ] - if annotations is not None: - annotations = [ - self.normalize_annotation(annotation, get_image_size(image, input_data_format)) - for annotation, image in zip(annotations, images) - ] - - if do_pad: - # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} - data = self.pad( - images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format - ) - else: - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - for image in images - ] - data = {"pixel_values": images} - - encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) - if annotations is not None: - encoded_inputs["labels"] = [ - BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations - ] - - return encoded_inputs - - # POSTPROCESSING METHODS - TODO: add support for other frameworks - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`GroundingDINOForObjectDetection`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. - Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - - def post_process_object_detection( - self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100 - ): - """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`GroundingDINOForObjectDetection`]): - Raw outputs of the model. - threshold (`float`, *optional*): - Score threshold to keep object detection predictions. - target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. - top_k (`int`, *optional*, defaults to 100): - Keep only top k bounding boxes before filtering by thresholding. - - Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if target_sizes is not None: - if len(out_logits) != len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits" - ) - - prob = out_logits.sigmoid() - prob = prob.view(out_logits.shape[0], -1) - k_value = min(top_k, prob.size(1)) - topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - if isinstance(target_sizes, List): - img_h = torch.Tensor([i[0] for i in target_sizes]) - img_w = torch.Tensor([i[1] for i in target_sizes]) - else: - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [] - for s, l, b in zip(scores, labels, boxes): - score = s[s > threshold] - label = l[s > threshold] - box = b[s > threshold] - results.append({"scores": score, "labels": label, "boxes": box}) - - return results diff --git a/src/transformers/models/grounding_dino/tokenization_grounding_dino.py b/src/transformers/models/grounding_dino/tokenization_grounding_dino.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 From 98f38406a20244bc9179cea42357b3e54227c1e1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:28:13 -0300 Subject: [PATCH 076/252] Fixed some issues with configuration --- .../configuration_grounding_dino.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index fbd0d483b48e45..e900714852fbaa 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Grounding DINO model configuration""" +import os +from typing import Union from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -25,7 +27,7 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } -# Copied from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet +# Modified from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet class GroundingDINOTextPrenetConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a @@ -134,6 +136,24 @@ def __init__( self.use_cache = use_cache self.classifier_dropout = classifier_dropout + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPSegConfig + if config_dict.get("model_type") == "grounding-dino": + config_dict = config_dict["text_backbone_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + class GroundingDINOConfig(PretrainedConfig): r""" @@ -289,7 +309,6 @@ def __init__( text_backbone_config=None, num_channels=3, num_queries=900, - max_position_embeddings=1024, encoder_layers=6, encoder_ffn_dim=2048, encoder_attention_heads=8, @@ -352,7 +371,6 @@ def __init__( self.backbone_config = backbone_config self.num_channels = num_channels self.num_queries = num_queries - self.max_position_embeddings = max_position_embeddings self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim self.encoder_layers = encoder_layers @@ -391,7 +409,7 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else text_backbone_config + self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else GroundingDINOTextPrenetConfig(**text_backbone_config) self.max_text_len = max_text_len # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout From 703eeff584416cc71a8592a3146c21d78acd11e4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:32:22 -0300 Subject: [PATCH 077/252] Just some modifications on conversion script --- .../convert_grounding_dino_to_hf.py | 89 ++++--------------- 1 file changed, 18 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index ed16da3f0c4617..680c3872bf68dc 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -159,8 +159,8 @@ def create_rename_keys(state_dict, config): 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', } fusion_key_mappings = { - 'gamma_v': 'fusion_layer.gamma_v', - 'gamma_l': 'fusion_layer.gamma_l', + 'gamma_v': 
'fusion_layer.vision_param', + 'gamma_l': 'fusion_layer.text_param', 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', @@ -326,66 +326,11 @@ def preprocess_caption(caption: str) -> str: return result return result + "." - def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list) -> list: - """Generate attention mask between each pair of special tokens - Args: - input_ids (torch.Tensor): input ids. Shape: [bs, num_token] - special_tokens_mask (list): special tokens mask. - Returns: - torch.Tensor: attention mask between each special tokens. - """ - input_ids = tokenized["input_ids"] - bs, num_token = input_ids.shape - # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens - special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() - for special_token in special_tokens_list: - special_tokens_mask |= input_ids == special_token - - # idxs: each row is a list of indices of special tokens - idxs = torch.nonzero(special_tokens_mask) - - # generate attention mask and positional ids - attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - position_ids = torch.zeros((bs, num_token), device=input_ids.device) - previous_col = 0 - for i in range(idxs.shape[0]): - row, col = idxs[i] - if (col == 0) or (col == num_token - 1): - attention_mask[row, col, col] = True - position_ids[row, col] = 0 - else: - attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True - position_ids[row, previous_col + 1 : col + 1] = torch.arange( - 0, col - previous_col, device=input_ids.device - ) - - previous_col = col - - return attention_mask, position_ids.to(torch.long) - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer - special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") - text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens - ) - - max_text_len = config.max_text_len - if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] - position_ids = position_ids[:, :max_text_len] - tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] - - # extract text embeddings - tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} - tokenized_for_encoder["attention_mask"] = text_self_attention_masks - tokenized_for_encoder["position_ids"] = position_ids - - return tokenized_for_encoder, tokenized.attention_mask.bool() + return tokenized @torch.no_grad() def convert_grounding_dino_checkpoint(args): @@ -415,7 +360,8 @@ def convert_grounding_dino_checkpoint(args): read_in_q_k_v(new_state_dict, config) # Load HF implementation with default config and converted state dict - model = GroundingDINOForObjectDetection(config).eval() + model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").eval() + # model = GroundingDINOForObjectDetection(config=config).eval() 
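[Editor's illustration, not part of the patch] The gamma_v/gamma_l to vision_param/text_param key mapping in the hunk above mirrors the parameter rename from PATCH 074: when loading checkpoints, Transformers has historically rewritten state-dict keys containing "gamma" and "beta" (a leftover from old TensorFlow LayerNorm naming), so parameters whose names contain "gamma" would never match their saved keys. A simplified sketch of that effect; the real rewrite lives inside PreTrainedModel's loading code and this helper only approximates it:

def fix_key(key: str) -> str:
    # Simplified stand-in for the legacy rewrite applied to checkpoint keys:
    # "gamma" -> "weight", "beta" -> "bias".
    return key.replace("gamma", "weight").replace("beta", "bias")

# A parameter registered as `fusion_layer.gamma_v` would be looked up under a
# rewritten key and silently fail to load:
print(fix_key("fusion_layer.gamma_v"))       # fusion_layer.weight_v
# The renamed parameter keeps a stable key:
print(fix_key("fusion_layer.vision_param"))  # fusion_layer.vision_param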
model.load_state_dict(new_state_dict, strict=False) # Load and process test image @@ -425,19 +371,24 @@ def convert_grounding_dino_checkpoint(args): [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] ) image_inputs = image_processor(image) - text_inputs, text_token_mask = text_processor(text, config) + text_inputs = text_processor(text, config) # Running forward output = model( pixel_values=image_inputs.unsqueeze(0), - input_ids=text_inputs["input_ids"], - attention_mask=text_inputs["attention_mask"], - token_type_ids=text_inputs["token_type_ids"], - text_token_mask=text_token_mask, - text_self_attention_masks=text_inputs["attention_mask"], - position_ids=text_inputs["position_ids"], + **text_inputs ) + # output.pred_boxes[:, :3, :] + # tensor([[[0.7674, 0.4136, 0.4572, 0.7305], + # [0.2566, 0.5463, 0.4760, 0.8777], + # [0.2585, 0.5442, 0.4640, 0.8683]]]) + # + # output.logits[:, :3, :4] + # tensor([[[-4.8913, -0.1900, -0.2161, -4.2374], + # [-4.9652, -0.3719, -0.3950, -4.2315], + # [-5.9599, -3.3765, -3.3104, -5.9752]]]) + if pytorch_dump_folder_path is not None: print(f"Saving model {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) @@ -448,10 +399,6 @@ def convert_grounding_dino_checkpoint(args): if push_to_hub: print(f"Pushing model and image processor for {model_name} to hub") model.push_to_hub(f"EduardoPacheco/{model_name}") - #TODO push image processor to hub - # image_processor.push_to_hub(f"microsoft/{model_name}") - #TODO push tokenizer to hub - #TODO push processor to hub if __name__ == "__main__": @@ -459,7 +406,7 @@ def convert_grounding_dino_checkpoint(args): # Required parameters parser.add_argument( "--model_name", - default="grounding-dino-base", + default="grounding-dino-tiny", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", From c1c1467a80ea9bbbf689c5cc55221da6f0bdb51a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 13 Oct 2023 17:33:52 -0300 Subject: [PATCH 078/252] Other modifications --- src/transformers/__init__.py | 4 ++-- src/transformers/models/grounding_dino/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 309ce05c8345e9..6ceff48c7c5cbc 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -358,7 +358,7 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], + "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -4512,7 +4512,7 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index e3767e017d1023..df2b0d907f1b65 100644 --- 
a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -18,7 +18,7 @@ _import_structure = { - "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig"], + "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], } try: @@ -36,7 +36,7 @@ if TYPE_CHECKING: - from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig + from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig try: if not is_torch_available(): From bfb8829f1f7e205760ede9f621b708a3f0e07943 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 16:36:51 +0200 Subject: [PATCH 079/252] Fix style --- src/transformers/__init__.py | 12 ++++++++++-- src/transformers/models/grounding_dino/__init__.py | 12 ++++++++++-- .../grounding_dino/configuration_grounding_dino.py | 9 +++++++-- .../grounding_dino/convert_grounding_dino_to_hf.py | 11 +++++------ .../models/grounding_dino/modeling_grounding_dino.py | 8 ++++++-- 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6ceff48c7c5cbc..aaab9c8fff2c21 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -358,7 +358,11 @@ "GPTSanJapaneseTokenizer", ], "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], - "models.grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], + "models.grounding_dino": [ + "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GroundingDINOConfig", + "GroundingDINOTextPrenetConfig", + ], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroupViTConfig", @@ -4512,7 +4516,11 @@ GPTSanJapaneseTokenizer, ) from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig - from .models.grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, + GroundingDINOConfig, + GroundingDINOTextPrenetConfig, + ) from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GroupViTConfig, diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index df2b0d907f1b65..8ed227086ac3ae 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -18,7 +18,11 @@ _import_structure = { - "configuration_grounding_dino": ["GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOTextPrenetConfig"], + "configuration_grounding_dino": [ + "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GroundingDINOConfig", + "GroundingDINOTextPrenetConfig", + ], } try: @@ -36,7 +40,11 @@ if TYPE_CHECKING: - from .configuration_grounding_dino import GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOTextPrenetConfig + from .configuration_grounding_dino import ( + GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, + GroundingDINOConfig, + GroundingDINOTextPrenetConfig, + ) try: if not is_torch_available(): diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py 
b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index e900714852fbaa..0de76985e82338 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -18,7 +18,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING, AutoConfig +from ..auto import CONFIG_MAPPING logger = logging.get_logger(__name__) @@ -27,6 +27,7 @@ "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", } + # Modified from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet class GroundingDINOTextPrenetConfig(PretrainedConfig): r""" @@ -409,7 +410,11 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = GroundingDINOTextPrenetConfig() if text_backbone_config is None else GroundingDINOTextPrenetConfig(**text_backbone_config) + self.text_backbone_config = ( + GroundingDINOTextPrenetConfig() + if text_backbone_config is None + else GroundingDINOTextPrenetConfig(**text_backbone_config) + ) self.max_text_len = max_text_len # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 680c3872bf68dc..d58bebd09490cc 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -326,15 +326,17 @@ def preprocess_caption(caption: str) -> str: return result return result + "." - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Using just for now since I didn't finish the tokenizer + tokenizer = AutoTokenizer.from_pretrained( + "bert-base-uncased" + ) # Using just for now since I didn't finish the tokenizer text = preprocess_caption(text) tokenized = tokenizer([text], padding="longest", return_tensors="pt") return tokenized + @torch.no_grad() def convert_grounding_dino_checkpoint(args): - model_name = args.model_name pytorch_dump_folder_path = args.pytorch_dump_folder_path push_to_hub = args.push_to_hub @@ -374,10 +376,7 @@ def convert_grounding_dino_checkpoint(args): text_inputs = text_processor(text, config) # Running forward - output = model( - pixel_values=image_inputs.unsqueeze(0), - **text_inputs - ) + model(pixel_values=image_inputs.unsqueeze(0), **text_inputs) # output.pred_boxes[:, :3, :] # tensor([[[0.7674, 0.4136, 0.4572, 0.7305], diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 71e7cb33fba0b9..104ef8c3d20e92 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1932,7 +1932,7 @@ def custom_forward(*inputs): text_encoder_hidden_states, text_encoder_attention_mask, self_attn_mask, - None + None, ) else: layer_outputs = decoder_layer( @@ -2012,7 +2012,10 @@ def custom_forward(*inputs): text_cross_attentions=all_cross_attns_text, ) + SPECIAL_TOKENS = [101, 102, 1012, 1029] + + def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: """Generate attention mask between each pair of special tokens and positional ids. 
Args: @@ -2255,7 +2258,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) - text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere max_text_len = self.config.max_text_len if text_self_attention_masks.shape[1] > max_text_len: @@ -3654,6 +3657,7 @@ class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ + config_class = GroundingDINOTextPrenetConfig def __init__(self, config, add_pooling_layer=True): From 587589ee8fa07e4b36d85aaa312b1210ec655935 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 16:58:06 +0200 Subject: [PATCH 080/252] Improve fixup --- README.md | 2 +- README_es.md | 2 +- README_hd.md | 2 +- README_ja.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.md | 1 + docs/source/en/model_doc/grounding-dino.md | 12 +-- .../models/auto/image_processing_auto.py | 2 +- .../configuration_grounding_dino.py | 86 +++++++------------ .../test_modeling_grounding_dino.py | 5 ++ 12 files changed, 50 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index 3311a4785b54d7..4774e04faedecb 100644 --- a/README.md +++ b/README.md @@ -375,7 +375,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. 
**[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_es.md b/README_es.md index e5497cdd9cd8f6..3d1f6cc0099906 100644 --- a/README_es.md +++ b/README_es.md @@ -350,7 +350,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. 
**[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_hd.md b/README_hd.md index 7e85a8c53d1713..381792c11f76da 100644 --- a/README_hd.md +++ b/README_hd.md @@ -322,7 +322,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. 
**[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others से) Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. द्वाराअनुसंधान पत्र [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) के साथ जारी किया गया 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv .org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https ://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। diff --git a/README_ja.md b/README_ja.md index 8f347bdd79264e..d2c660fd257734 100644 --- a/README_ja.md +++ b/README_ja.md @@ -384,7 +384,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました. 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others から) Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 
から公開された研究論文 [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) diff --git a/README_ko.md b/README_ko.md index 31418f42b8a9ff..aa3c524f25f075 100644 --- a/README_ko.md +++ b/README_ko.md @@ -299,7 +299,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others 에서 제공)은 Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.의 [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499)논문과 함께 발표했습니다. 1. 
**[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 107ed00f3de87f..9bb392e266b57f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -323,7 +323,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (来自 Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) 伴随论文 [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) 由 Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang 发布。 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index a633740b292821..e0878fc3bc774c 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -335,7 +335,7 @@ conda install -c huggingface transformers 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. -1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from ) released with the paper []() by . +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index a1fbc63c7cc4e0..41627a7a81392f 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -148,6 +148,7 @@ Flax), PyTorch, and/or TensorFlow. | [GPTBigCode](model_doc/gpt_bigcode) | ✅ | ❌ | ❌ | | [GPTSAN-japanese](model_doc/gptsan-japanese) | ✅ | ❌ | ❌ | | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | +| [Grounding DINO](model_doc/grounding-dino) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index 161a90609174b3..05b5f84d698347 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -18,20 +18,22 @@ rendered properly in your Markdown viewer. ## Overview -The Grounding DINO model was proposed in []() by . - +The Grounding DINO model was proposed in [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. Grounding DINO extends a closed-set object detection model with a text encoder, enabling open-set object detection. The model achieves remarkable results, such as 52.5 AP on COCO zero-shot. The abstract from the paper is the following: -** +*In this paper, we present an open-set object detector, called Grounding DINO, by marrying Transformer-based detector DINO with grounded pre-training, which can detect arbitrary objects with human inputs such as category names or referring expressions. The key solution of open-set object detection is introducing language to a closed-set detector for open-set concept generalization. To effectively fuse language and vision modalities, we conceptually divide a closed-set detector into three phases and propose a tight fusion solution, which includes a feature enhancer, a language-guided query selection, and a cross-modality decoder for cross-modality fusion. While previous works mainly evaluate open-set object detection on novel categories, we propose to also perform evaluations on referring expression comprehension for objects specified with attributes. Grounding DINO performs remarkably well on all three settings, including benchmarks on COCO, LVIS, ODinW, and RefCOCO/+/g. Grounding DINO achieves a 52.5 AP on the COCO detection zero-shot transfer benchmark, i.e., without any training data from COCO. It sets a new record on the ODinW zero-shot benchmark with a mean 26.1 AP.* Tips: -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). 
+This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/IDEA-Research/GroundingDINO). +## GroundingDINOTextPrenetConfig + +[[autodoc]] GroundingDINOTextPrenetConfig ## GroundingDINOConfig diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 6399fe192616af..cf33369ef5492d 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -66,7 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), + ("grounding-dino", "DeformableDetrImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0de76985e82338..8ba34f727243b0 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -65,23 +65,19 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): type_vocab_size (`int`, *optional*, defaults to 2): The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`] or [`TFGroundingDINOTextPrenetModel`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + The index of the padding token in the token vocabulary. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - is_decoder (`bool`, *optional*, defaults to `False`): - Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. 
Examples: @@ -111,12 +107,10 @@ def __init__( attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, - initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, position_embedding_type="absolute", use_cache=True, - classifier_dropout=None, **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -131,11 +125,9 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache - self.classifier_dropout = classifier_dropout @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -167,55 +159,50 @@ class GroundingDINOConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - use_timm_backbone (`bool`, *optional*, defaults to `True`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] API. - backbone_config (`PretrainedConfig` or `dict`, *optional*): + backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `{'model_type': 'swin'}`): The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which case it will default to `ResNetConfig()`. text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): The configuration of the text backbone model. Should be a bert-like config. num_channels (`int`, *optional*, defaults to 3): The number of input channels. - num_queries (`int`, *optional*, defaults to 300): + num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`GroundingDINOModel`] can detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead. - d_model (`int`, *optional*, defaults to 256): - Dimension of the layers. encoder_layers (`int`, *optional*, defaults to 6): Number of encoder layers. - decoder_layers (`int`, *optional*, defaults to 6): - Number of decoder layers. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. encoder_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer encoder. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. decoder_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (`int`, *optional*, defaults to 1024): - Dimension of the "intermediate" (often named feed-forward) layer in decoder. - encoder_ffn_dim (`int`, *optional*, defaults to 1024): - Dimension of the "intermediate" (often named feed-forward) layer in decoder. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as an encoder/decoder or not. activation_function (`str` or `function`, *optional*, defaults to `"relu"`): The non-linear activation function (function or string) in the encoder and pooler. 
If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. dropout (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. activation_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for activations inside the fully connected layer. - init_std (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - init_xavier_std (`float`, *optional*, defaults to 1): - The scaling factor used for the Xavier initialization gain in the HM Attention map module. - encoder_layerdrop (`float`, *optional*, defaults to 0.0): - The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) - for more details. auxiliary_loss (`bool`, *optional*, defaults to `False`): Whether auxiliary decoding losses (loss at each decoder layer) are to be used. position_embedding_type (`str`, *optional*, defaults to `"sine"`): Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. - backbone (`str`, *optional*, defaults to `"resnet50"`): + backbone (`str`, *optional*, defaults to `"swin"`): Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional backbone from the timm package. For a list of all available models, see [this page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). @@ -224,36 +211,30 @@ class GroundingDINOConfig(PretrainedConfig): dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. - class_cost (`float`, *optional*, defaults to 1): - Relative weight of the classification error in the Hungarian matching cost. - bbox_cost (`float`, *optional*, defaults to 5): - Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. - giou_cost (`float`, *optional*, defaults to 2): - Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. - mask_loss_coefficient (`float`, *optional*, defaults to 1): - Relative weight of the Focal loss in the panoptic segmentation loss. - dice_loss_coefficient (`float`, *optional*, defaults to 1): - Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. - bbox_loss_coefficient (`float`, *optional*, defaults to 5): - Relative weight of the L1 bounding box loss in the object detection loss. - giou_loss_coefficient (`float`, *optional*, defaults to 2): - Relative weight of the generalized IoU loss in the object detection loss. - eos_coefficient (`float`, *optional*, defaults to 0.1): - Relative classification weight of the 'no-object' class in the object detection loss. num_feature_levels (`int`, *optional*, defaults to 4): The number of input feature levels. encoder_n_points (`int`, *optional*, defaults to 4): The number of sampled keys in each feature level for each attention head in the encoder. decoder_n_points (`int`, *optional*, defaults to 4): The number of sampled keys in each feature level for each attention head in the decoder. 
- two_stage (`bool`, *optional*, defaults to `False`): + two_stage (`bool`, *optional*, defaults to `True`): Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. - two_stage_num_proposals (`int`, *optional*, defaults to 300): + two_stage_num_proposals (`int`, *optional*, defaults to 900): The number of region proposals to be generated, in case `two_stage` is set to `True`. - with_box_refine (`bool`, *optional*, defaults to `False`): + with_box_refine (`bool`, *optional*, defaults to `True`): Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes based on the predictions from the previous layer. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. focal_alpha (`float`, *optional*, defaults to 0.25): Alpha parameter in the focal loss. disable_custom_kernels (`bool`, *optional*, defaults to `False`): @@ -261,8 +242,6 @@ class GroundingDINOConfig(PretrainedConfig): kernels are not supported by PyTorch ONNX export. max_text_len (`int`, *optional*, defaults to 256): The maximum length of the text input. - sub_sentence_present (`bool`, *optional*, defaults to `True`): - Whether to use sub-sentence present in the text input. text_enhancer_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the text enhancer. 
fusion_droppath (`float`, *optional*, defaults to 0.1): @@ -322,7 +301,6 @@ def __init__( dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, - return_intermediate=True, auxiliary_loss=False, position_embedding_type="sine", backbone="swin", @@ -337,11 +315,8 @@ def __init__( class_cost=1, bbox_cost=5, giou_cost=2, - mask_loss_coefficient=1, - dice_loss_coefficient=1, bbox_loss_coefficient=5, giou_loss_coefficient=2, - eos_coefficient=0.1, focal_alpha=0.25, disable_custom_kernels=False, # other parameters @@ -402,11 +377,8 @@ def __init__( self.bbox_cost = bbox_cost self.giou_cost = giou_cost # Loss coefficients - self.mask_loss_coefficient = mask_loss_coefficient - self.dice_loss_coefficient = dice_loss_coefficient self.bbox_loss_coefficient = bbox_loss_coefficient self.giou_loss_coefficient = giou_loss_coefficient - self.eos_coefficient = eos_coefficient self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 3007eef6399916..b4c35ba7bda906 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -193,6 +193,11 @@ class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTe test_pruning = False test_head_masking = False test_missing_keys = False + pipeline_model_mapping = ( + {"feature-extraction": GroundingDINOModel, "object-detection": GroundingDINOForObjectDetection} + if is_torch_available() + else {} + ) # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): From f68361110aea1e6d657dfe2a7bf6a6114eeb9e17 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 17:12:14 +0200 Subject: [PATCH 081/252] Improve conversion script --- .../convert_grounding_dino_to_hf.py | 26 ++++++++----------- .../grounding_dino/modeling_grounding_dino.py | 2 +- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index d58bebd09490cc..1f5fbae366cd5b 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -14,8 +14,7 @@ # limitations under the License. """Convert GroundingDINO SimMIM checkpoints from the original repository. 
-URL: -https://github.com/microsoft/GroundingDINO-Transformer/blob/main/MODELHUB.md#simmim-pretrained-grounding_dino-v1-models""" +URL: https://github.com/IDEA-Research/GroundingDINO""" import argparse @@ -342,16 +341,16 @@ def convert_grounding_dino_checkpoint(args): push_to_hub = args.push_to_hub checkpoint_mapping = { - "grounding-dino-tiny": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_tiny_clean.pth", - "grounding-dino-base": "/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", + "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth", + "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth", } # Define default GroundingDINO configuation config = get_grounding_dino_config(model_name) - checkpoint_path = checkpoint_mapping[model_name] - # Load original checkpoint - original_state_dict = torch.load(checkpoint_path, map_location="cpu") + checkpoint_url = checkpoint_mapping[model_name] + original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] + original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} # Rename keys new_state_dict = original_state_dict.copy() @@ -362,9 +361,12 @@ def convert_grounding_dino_checkpoint(args): read_in_q_k_v(new_state_dict, config) # Load HF implementation with default config and converted state dict - model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").eval() + model = GroundingDINOForObjectDetection(config) + model.eval() # model = GroundingDINOForObjectDetection(config=config).eval() - model.load_state_dict(new_state_dict, strict=False) + missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) # Load and process test image image = prepare_img() @@ -410,12 +412,6 @@ def convert_grounding_dino_checkpoint(args): choices=["grounding-dino-tiny", "grounding-dino-base"], help="Name of the GroundingDINO model you'd like to convert.", ) - # parser.add_argument( - # "--checkpoint_path", - # default="/home/eduardo/Desktop/Projects/GroundingDINO/weights/grounding_dino_base_clean.pth", - # type=str, - # help="Path to the original PyTorch checkpoint (.pth file).", - # ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 104ef8c3d20e92..b4e99fa6a776fc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 IDEA Research and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
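The conversion-script changes in the patch above boil down to a standard checkpoint-porting pattern: download the released weights, strip the `module.` prefix, rename parameters to the Hugging Face layout, and load them non-strictly so that mismatches are printed instead of raising. The sketch below only illustrates that pattern; the rename rule and the `convert_checkpoint` helper are hypothetical stand-ins for the script's real `rename_key` and `read_in_q_k_v` logic.

```python
# Minimal sketch of the conversion pattern, not the script's actual implementation.
import torch
from torch import nn


def convert_checkpoint(model: nn.Module, checkpoint_url: str) -> nn.Module:
    # Download the original weights; the released Grounding DINO checkpoints store them
    # under a "model" key with a "module." prefix, as handled in the script above.
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
    state_dict = {key.replace("module.", ""): value for key, value in state_dict.items()}

    # Map original parameter names onto the HF naming scheme (illustrative rule only;
    # the real mapping is built by rename_key and read_in_q_k_v).
    renamed = {key.replace("backbone.0", "model.backbone"): value for key, value in state_dict.items()}

    # Load non-strictly and surface anything that did not match, mirroring the patch.
    missing_keys, unexpected_keys = model.load_state_dict(renamed, strict=False)
    print("Missing keys:", missing_keys)
    print("Unexpected keys:", unexpected_keys)
    return model
```

Running it with the `grounding-dino-tiny` URL from the mapping above should reproduce the missing/unexpected key printout that the script relies on as a sanity check before comparing logits and boxes against the expected values.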
From 3a0c7420ec1397dd4ea7da6c0eb71d01f316c832 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 19:29:42 +0200 Subject: [PATCH 082/252] Improve conversion script --- .../convert_grounding_dino_to_hf.py | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 1f5fbae366cd5b..fa0455ba94eb39 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -360,10 +360,9 @@ def convert_grounding_dino_checkpoint(args): rename_key(new_state_dict, src, dest) read_in_q_k_v(new_state_dict, config) - # Load HF implementation with default config and converted state dict + # Load HF model model = GroundingDINOForObjectDetection(config) model.eval() - # model = GroundingDINOForObjectDetection(config=config).eval() missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) print("Missing keys:", missing_keys) print("Unexpected keys:", unexpected_keys) @@ -378,23 +377,23 @@ def convert_grounding_dino_checkpoint(args): text_inputs = text_processor(text, config) # Running forward - model(pixel_values=image_inputs.unsqueeze(0), **text_inputs) - - # output.pred_boxes[:, :3, :] - # tensor([[[0.7674, 0.4136, 0.4572, 0.7305], - # [0.2566, 0.5463, 0.4760, 0.8777], - # [0.2585, 0.5442, 0.4640, 0.8683]]]) - # - # output.logits[:, :3, :4] - # tensor([[[-4.8913, -0.1900, -0.2161, -4.2374], - # [-4.9652, -0.3719, -0.3950, -4.2315], - # [-5.9599, -3.3765, -3.3104, -5.9752]]]) + with torch.no_grad(): + outputs = model(pixel_values=image_inputs.unsqueeze(0), **text_inputs) + + print("First values of logits:", outputs.logits[0, :3, :3]) + print("First values of boxes:", outputs.pred_boxes[0, :3, :3]) + + # verify outputs + expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]) + expected_logits = torch.tensor( + [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] + ) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) + assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4) if pytorch_dump_folder_path is not None: - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + print(f"Saving model and image processor for {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - - print(f"Saving image processor to {pytorch_dump_folder_path}") image_processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: From 6115547d4fcd1e4f532f503f6847b709f31507ab Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 21:04:07 +0200 Subject: [PATCH 083/252] Add GroundingDINOProcessor --- docs/source/en/model_doc/grounding-dino.md | 4 + src/transformers/__init__.py | 2 + .../models/grounding_dino/__init__.py | 2 + .../convert_grounding_dino_to_hf.py | 45 ++++-- .../processing_grounding_dino.py | 151 ++++++++++++++++++ 5 files changed, 189 insertions(+), 15 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index 05b5f84d698347..03c3549c32cb5f 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -31,6 +31,10 @@ Tips: This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and 
[nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/IDEA-Research/GroundingDINO). +## GroundingDINOProcessor + +[[autodoc]] GroundingDINOProcessor + ## GroundingDINOTextPrenetConfig [[autodoc]] GroundingDINOTextPrenetConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aaab9c8fff2c21..c73345163a37b9 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -361,6 +361,7 @@ "models.grounding_dino": [ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", + "GroundingDINOProcessor", "GroundingDINOTextPrenetConfig", ], "models.groupvit": [ @@ -4519,6 +4520,7 @@ from .models.grounding_dino import ( GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, + GroundingDINOProcessor, GroundingDINOTextPrenetConfig, ) from .models.groupvit import ( diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 8ed227086ac3ae..229666382564b8 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -23,6 +23,7 @@ "GroundingDINOConfig", "GroundingDINOTextPrenetConfig", ], + "processing_grounding_dino": ["GroundingDINOProcessor"], } try: @@ -45,6 +46,7 @@ GroundingDINOConfig, GroundingDINOTextPrenetConfig, ) + from .processing_grounding_dino import GroundingDINOProcessor try: if not is_torch_available(): diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index fa0455ba94eb39..042771e2a37280 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -23,7 +23,13 @@ from PIL import Image from torchvision import transforms as T -from transformers import AutoTokenizer, GroundingDINOConfig, GroundingDINOForObjectDetection +from transformers import ( + AutoTokenizer, + DeformableDetrImageProcessor, + GroundingDINOConfig, + GroundingDINOForObjectDetection, + GroundingDINOProcessor, +) IMAGENET_MEAN = [0.485, 0.456, 0.406] @@ -318,20 +324,21 @@ def prepare_img(): return image -def text_processor(text: str, config): - def preprocess_caption(caption: str) -> str: - result = caption.lower().strip() - if result.endswith("."): - return result - return result + "." +def preprocess_caption(caption: str) -> str: + result = caption.lower().strip() + if result.endswith("."): + return result + return result + "." 
+ +def text_processor(text: str): tokenizer = AutoTokenizer.from_pretrained( "bert-base-uncased" ) # Using just for now since I didn't finish the tokenizer text = preprocess_caption(text) - tokenized = tokenizer([text], padding="longest", return_tensors="pt") + original_text_inputs = tokenizer([text], padding="longest", return_tensors="pt") - return tokenized + return original_text_inputs @torch.no_grad() @@ -369,16 +376,23 @@ def convert_grounding_dino_checkpoint(args): # Load and process test image image = prepare_img() + transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) + original_pixel_values = transforms(image).unsqueeze(0) text = "a cat" - image_processor = T.Compose( - [T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)] - ) - image_inputs = image_processor(image) - text_inputs = text_processor(text, config) + text_inputs = text_processor(text) + + image_processor = DeformableDetrImageProcessor() + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer) + + inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") + + assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) + assert torch.allclose(text_inputs["input_ids"], inputs.input_ids, atol=1e-4) # Running forward with torch.no_grad(): - outputs = model(pixel_values=image_inputs.unsqueeze(0), **text_inputs) + outputs = model(pixel_values=original_pixel_values, **text_inputs) print("First values of logits:", outputs.logits[0, :3, :3]) print("First values of boxes:", outputs.pred_boxes[0, :3, :3]) @@ -390,6 +404,7 @@ def convert_grounding_dino_checkpoint(args): ) assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4) + print("Looks ok!") if pytorch_dump_folder_path is not None: print(f"Saving model and image processor for {model_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index e69de29bb2d1d6..5bc1feaa2d510c 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -0,0 +1,151 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Grounding DINO. 
+""" + +from typing import List, Optional, Union + +from ...image_utils import ImageInput +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType + + +class GroundingDINOProcessor(ProcessorMixin): + r""" + Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a + single processor. + + [`GroundingDINOProcessor`] offers all the functionalities of [`DeformableDetrImageProcessor`] and + [`AutoTokenizer`]. See the docstring of [`~GroundingDINOProcessor.__call__`] and [`~GroundingDINOProcessor.decode`] + for more information. + + Args: + image_processor (`DeformableDetrImageProcessor`): + An instance of [`DeformableDetrImageProcessor`]. The image processor is a required input. + tokenizer (`AutoTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + """ + attributes = ["image_processor", "tokenizer"] + image_processor_class = "DeformableDetrImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, tokenizer): + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_token_type_ids: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchEncoding: + """ + This method uses [`DeformableDetrImageProcessor.__call__`] method to prepare image(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. 
+ """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + + # Get only text + if images is None: + text_encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + return text_encoding + + # add pixel_values + encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + + if text is not None: + text_encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + else: + text_encoding = None + + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + + return encoding_image_processor + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. 
+ """ + return self.tokenizer.decode(*args, **kwargs) + + @property + # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) From cc1788f351f3eb0c47643c217f5e55ac8adc1dc7 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 21:47:32 +0200 Subject: [PATCH 084/252] More improvements --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 042771e2a37280..e5505b50297186 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -407,13 +407,14 @@ def convert_grounding_dino_checkpoint(args): print("Looks ok!") if pytorch_dump_folder_path is not None: - print(f"Saving model and image processor for {model_name} to {pytorch_dump_folder_path}") + print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print(f"Pushing model and image processor for {model_name} to hub") + print(f"Pushing model and processor for {model_name} to hub") model.push_to_hub(f"EduardoPacheco/{model_name}") + processor.push_to_hub(f"EduardoPacheco/{model_name}") if __name__ == "__main__": From a6dea4ada4796b2716bdfcf0b32cb1df97c5a959 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 14 Oct 2023 21:55:01 +0200 Subject: [PATCH 085/252] Return token type ids --- .../convert_grounding_dino_to_hf.py | 16 ++-------------- .../grounding_dino/processing_grounding_dino.py | 2 +- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index e5505b50297186..a0d0d205454217 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -331,16 +331,6 @@ def preprocess_caption(caption: str) -> str: return result + "." 
-def text_processor(text: str): - tokenizer = AutoTokenizer.from_pretrained( - "bert-base-uncased" - ) # Using just for now since I didn't finish the tokenizer - text = preprocess_caption(text) - original_text_inputs = tokenizer([text], padding="longest", return_tensors="pt") - - return original_text_inputs - - @torch.no_grad() def convert_grounding_dino_checkpoint(args): model_name = args.model_name @@ -378,21 +368,19 @@ def convert_grounding_dino_checkpoint(args): image = prepare_img() transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) original_pixel_values = transforms(image).unsqueeze(0) - text = "a cat" - text_inputs = text_processor(text) image_processor = DeformableDetrImageProcessor() tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer) + text = "a cat" inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) - assert torch.allclose(text_inputs["input_ids"], inputs.input_ids, atol=1e-4) # Running forward with torch.no_grad(): - outputs = model(pixel_values=original_pixel_values, **text_inputs) + outputs = model(**inputs) print("First values of logits:", outputs.logits[0, :3, :3]) print("First values of boxes:", outputs.pred_boxes[0, :3, :3]) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 5bc1feaa2d510c..10fd6e9834a9c3 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -60,7 +60,7 @@ def __call__( return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, - return_token_type_ids: bool = False, + return_token_type_ids: bool = True, return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, From ae6e110142c1b679d8ed58f4066613562948980a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 14 Oct 2023 16:58:23 -0300 Subject: [PATCH 086/252] something --- .../models/auto/image_processing_auto.py | 2 +- .../models/auto/tokenization_auto.py | 1 + .../test_modeling_grounding_dino.py | 109 +++++++++++------- 3 files changed, 71 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index d6d722b3e0842b..7962ccc11c5ba7 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -66,7 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), + ("grounding-dino", "DeformableDetrImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index a5b167183ce913..5b4f69490d6146 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -165,6 +165,7 @@ ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)), ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), 
("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)), + ("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), ("hubert", ("Wav2Vec2CTCTokenizer", None)), diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 3007eef6399916..4058ab073fda2b 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -20,7 +20,7 @@ import unittest from typing import Dict, List, Tuple -from transformers import GroundingDINOConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available from transformers.file_utils import cached_property from transformers.testing_utils import ( require_timm, @@ -53,59 +53,57 @@ class GroundingDINOModelTester: def __init__( self, parent, + image_size=196, batch_size=8, - is_training=True, - use_labels=True, + is_training=False, + use_labels=False, hidden_size=32, num_hidden_layers=2, num_attention_heads=8, - intermediate_size=4, - hidden_act="gelu", + hidden_act="relu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, num_queries=12, num_channels=3, - image_size=196, n_targets=8, - num_labels=91, num_feature_levels=4, - encoder_n_points=2, - decoder_n_points=6, + intermediate_size=32 ): self.parent = parent self.batch_size = batch_size self.is_training = is_training self.use_labels = use_labels + self.image_size = image_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.num_queries = num_queries self.num_channels = num_channels - self.image_size = image_size self.n_targets = n_targets - self.num_labels = num_labels self.num_feature_levels = num_feature_levels - self.encoder_n_points = encoder_n_points - self.decoder_n_points = decoder_n_points - - # we also set the expected seq length for both encoder and decoder - self.encoder_seq_length = ( - math.ceil(self.image_size / 8) ** 2 - + math.ceil(self.image_size / 16) ** 2 - + math.ceil(self.image_size / 32) ** 2 - + math.ceil(self.image_size / 64) ** 2 - ) - self.decoder_seq_length = self.num_queries + self.intermediate_size = intermediate_size + def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + input_ids = torch.Tensor([[101, 1037, 4937, 1012, 102]]).long() + text_token_mask = torch.ones_like(input_ids).bool() + text_self_attention_masks = torch.Tensor([ + [[ True, False, False, False, False], + [False, True, True, True, False], + [False, True, True, True, False], + [False, True, True, True, False], + [False, False, False, False, True]] + ] + ).bool() + token_type_ids = torch.zeros_like(input_ids).long() + position_ids = torch.Tensor([[0, 0, 1, 2, 0]]).long() + labels = None if self.use_labels: # labels is a list of Dict (each Dict being the labels for a given example in the 
batch) @@ -120,16 +118,16 @@ def prepare_config_and_inputs(self): labels.append(target) config = self.get_config() - return config, pixel_values, pixel_mask, labels + return config, pixel_values, pixel_mask, input_ids, text_token_mask, text_self_attention_masks, text_self_attention_masks.copy(), token_type_ids, position_ids, labels def get_config(self): - resnet_config = ResNetConfig( + swin_config = SwinConfig( num_channels=3, - embeddings_size=10, - hidden_sizes=[10, 20, 30, 40], + hidden_size=128, + embed_dim=96, + image_size=self.image_size, + window_size=7, depths=[1, 1, 2, 1], - hidden_act="relu", - num_labels=3, out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) @@ -149,36 +147,67 @@ def get_config(self): encoder_n_points=self.encoder_n_points, decoder_n_points=self.decoder_n_points, use_timm_backbone=False, - backbone_config=resnet_config, + backbone_config=swin_config, ) def prepare_config_and_inputs_for_common(self): - config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + config, pixel_values, pixel_mask, input_ids, text_token_mask, text_self_attention_masks, attention_mask, token_type_ids, position_ids, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask, + "input_ids": input_ids, "text_token_mask": text_token_mask, + "text_self_attention_masks": text_self_attention_masks, "token_type_ids": token_type_ids, + "position_ids": position_ids, "attention_mask": attention_mask + } return config, inputs_dict - def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, labels): + def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, input_ids, text_token_mask, text_self_attention_masks, attention_mask, token_type_ids, position_ids, labels): model = GroundingDINOModel(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, + input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids + ) + + result = model(pixel_values=pixel_values, + input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids + ) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) - def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, input_ids, text_token_mask, text_self_attention_masks, attention_mask, token_type_ids, position_ids, labels): model = GroundingDINOForObjectDetection(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, + input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids + ) + result = model(pixel_values=pixel_values, 
+ input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids + ) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, + input_ids=input_ids, text_token_mask=text_token_mask, + text_self_attention_masks=text_self_attention_masks, + attention_mask=attention_mask, token_type_ids=token_type_ids, + position_ids=position_ids, labels=labels + ) self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) From 9fba8c2470afb14a33beeae4072072b7c445eb43 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Oct 2023 19:36:12 +0200 Subject: [PATCH 087/252] Fix more tests --- .../grounding_dino/modeling_grounding_dino.py | 52 ++++++++++++------- .../test_modeling_grounding_dino.py | 31 +++++++---- 2 files changed, 53 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index b4e99fa6a776fc..368830354e8eb7 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -898,18 +898,24 @@ def forward( class GroundingDINOBiMultiHeadAttention(nn.Module): - def __init__(self, vision_dim: int, text_dim: int, embed_dim: int, num_heads: int, dropout: float = 0.1): + def __init__(self, config): super().__init__() + vision_dim = text_dim = config.d_model + embed_dim = config.encoder_ffn_dim // 2 + num_heads = config.encoder_attention_heads // 2 + dropout = config.fusion_dropout + self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads self.vision_dim = vision_dim self.text_dim = text_dim - assert ( - self.head_dim * self.num_heads == self.embed_dim - ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
+ ) self.scale = self.head_dim ** (-0.5) self.dropout = dropout @@ -958,8 +964,6 @@ def forward( Returns: _type_: _description_ """ - # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': - # import ipdb; ipdb.set_trace() bsz, tgt_len, _ = vision_features.size() vision_query_states = self.vision_proj(vision_features) * self.scale @@ -1097,13 +1101,7 @@ def __init__(self, config, init_values=1e-4): # pre layer norm self.layer_norm_vision = nn.LayerNorm(config.d_model) self.layer_norm_text = nn.LayerNorm(config.d_model) - self.attn = GroundingDINOBiMultiHeadAttention( - vision_dim=config.d_model, - text_dim=config.d_model, - embed_dim=config.encoder_ffn_dim // 2, - num_heads=config.encoder_attention_heads // 2, - dropout=config.fusion_dropout, - ) + self.attn = GroundingDINOBiMultiHeadAttention(config) # add layer scale for training stability self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() @@ -1241,6 +1239,9 @@ def sine_func(x: torch.Tensor): class GroundingDINOEncoderLayer(nn.Module): def __init__(self, config) -> None: super().__init__() + + self.d_model = config.d_model + self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) self.fusion_layer = GroundingDINOFusionLayer(config) self.deformable_layer = GroundingDINODeformableLayer(config) @@ -1248,15 +1249,21 @@ def __init__(self, config) -> None: def get_text_position_embeddings( self, text_features: Tensor, text_position_embedding: Tensor, text_position_ids: Tensor ) -> Tensor: - bs, n_text, text_dim = text_features.shape + batch_size, seq_length, _ = text_features.shape if text_position_embedding is None and text_position_ids is None: text_position_embedding = ( - torch.arange(n_text, device=text_features.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs, 1, 1) + torch.arange(seq_length, device=text_features.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(batch_size, 1, 1) + ) + text_position_embedding = get_sine_pos_embed( + text_position_embedding, num_pos_feats=self.d_model, exchange_xy=False ) - text_position_embedding = get_sine_pos_embed(text_position_embedding, num_pos_feats=256, exchange_xy=False) if text_position_ids is not None: text_position_embedding = get_sine_pos_embed( - text_position_ids[..., None], num_pos_feats=256, exchange_xy=False + text_position_ids[..., None], num_pos_feats=self.d_model, exchange_xy=False ) return text_position_embedding @@ -2258,6 +2265,13 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere max_text_len = self.config.max_text_len @@ -2517,8 +2531,8 @@ def forward( self, pixel_values: torch.FloatTensor, input_ids: torch.LongTensor, - attention_mask: torch.LongTensor, - token_type_ids: torch.LongTensor, + attention_mask: torch.LongTensor = None, + token_type_ids: torch.LongTensor = None, pixel_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 
b4c35ba7bda906..20f0a23fb42316 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -33,7 +33,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -59,7 +59,7 @@ def __init__( hidden_size=32, num_hidden_layers=2, num_attention_heads=8, - intermediate_size=4, + intermediate_size=8, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, @@ -71,6 +71,7 @@ def __init__( num_feature_levels=4, encoder_n_points=2, decoder_n_points=6, + max_text_len=256, ): self.parent = parent self.batch_size = batch_size @@ -91,6 +92,7 @@ def __init__( self.num_feature_levels = num_feature_levels self.encoder_n_points = encoder_n_points self.decoder_n_points = decoder_n_points + self.max_text_len = max_text_len # we also set the expected seq length for both encoder and decoder self.encoder_seq_length = ( @@ -103,9 +105,10 @@ def __init__( def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + input_ids = ids_tensor([self.batch_size, self.max_text_len], self.num_labels) + labels = None if self.use_labels: # labels is a list of Dict (each Dict being the labels for a given example in the batch) @@ -120,7 +123,7 @@ def prepare_config_and_inputs(self): labels.append(target) config = self.get_config() - return config, pixel_values, pixel_mask, labels + return config, pixel_values, pixel_mask, input_ids, labels def get_config(self): resnet_config = ResNetConfig( @@ -150,35 +153,38 @@ def get_config(self): decoder_n_points=self.decoder_n_points, use_timm_backbone=False, backbone_config=resnet_config, + max_text_len=self.max_text_len, ) def prepare_config_and_inputs_for_common(self): - config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + config, pixel_values, pixel_mask, input_ids, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask, "input_ids": input_ids} return config, inputs_dict - def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, labels): + def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, input_ids, labels): model = GroundingDINOModel(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) - def create_and_check_grounding_dino_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + def create_and_check_grounding_dino_object_detection_head_model( + self, config, pixel_values, pixel_mask, input_ids, labels + ): model = GroundingDINOForObjectDetection(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = 
model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) result = model(pixel_values) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids, labels=labels) self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) @@ -203,6 +209,9 @@ class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTe def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + for k, v in inputs_dict.items(): + print(k, v.shape) + if return_labels: if model_class.__name__ == "GroundingDINOForObjectDetection": labels = [] From 684a0bb05a57d3ad0d11821788a046068c1e8448 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Oct 2023 19:45:44 +0200 Subject: [PATCH 088/252] More improvements --- .../configuration_grounding_dino.py | 21 ++++++++---------- .../grounding_dino/modeling_grounding_dino.py | 12 +++++----- .../test_modeling_grounding_dino.py | 22 ++++++------------- 3 files changed, 22 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 8ba34f727243b0..3a1740ceebcf27 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -28,22 +28,20 @@ } -# Modified from transformers.models.bert.configuration_bert.BertConfig with Bert->GroundingDINOTextPrenet class GroundingDINOTextPrenetConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`] or a - [`TFGroundingDINOTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, - defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration - to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. + This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`]. It is used to + instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BERT + [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`] or [`TFGroundingDINOTextPrenetModel`]. + `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
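The tests earlier in this series hard-code `text_self_attention_masks` and `position_ids` for `input_ids = [[101, 1037, 4937, 1012, 102]]`, and the modeling code derives them with `generate_masks_with_special_tokens_and_transfer_map`. The sketch below reproduces the tensors shown in those tests (a block-diagonal mask where tokens between two special tokens attend only within their phrase, special tokens attend only to themselves, and position ids restart per phrase); it is an illustration of the idea, not the library function itself, and the special-token ids are simply the ones appearing in the test input.

```python
import torch


def block_diagonal_text_masks(input_ids, special_token_ids=(101, 102, 1012)):
    # Sketch of the text masking hard-coded in the tests above; not library code.
    batch_size, seq_len = input_ids.shape
    masks = torch.zeros((batch_size, seq_len, seq_len), dtype=torch.bool)
    position_ids = torch.zeros((batch_size, seq_len), dtype=torch.long)
    for batch in range(batch_size):
        previous = -1
        for col in range(seq_len):
            if input_ids[batch, col].item() in special_token_ids:
                if col == previous + 1:
                    # isolated special token ([CLS], [SEP]) attends only to itself
                    masks[batch, col, col] = True
                else:
                    # phrase between two special tokens attends within itself
                    masks[batch, previous + 1 : col + 1, previous + 1 : col + 1] = True
                    position_ids[batch, previous + 1 : col + 1] = torch.arange(col - previous)
                previous = col
    return masks, position_ids


masks, position_ids = block_diagonal_text_masks(torch.tensor([[101, 1037, 4937, 1012, 102]]))
print(masks[0].int())  # block-diagonal pattern matching the test tensor
print(position_ids)    # tensor([[0, 0, 1, 2, 0]])
```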
num_hidden_layers (`int`, *optional*, defaults to 12): @@ -63,8 +61,7 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`] or - [`TFGroundingDINOTextPrenetModel`]. + The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`]. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. pad_token_id (`int`, *optional*, defaults to 0): @@ -153,7 +150,7 @@ class GroundingDINOConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO - [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + [idea-research/grounding-dino-tiny](https://huggingface.co/idea-research/grounding-dino-tiny) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -267,10 +264,10 @@ class GroundingDINOConfig(PretrainedConfig): ```python >>> from transformers import GroundingDINOConfig, GroundingDINOModel - >>> # Initializing a Grounding DINO SenseTime/deformable-detr style configuration + >>> # Initializing a Grounding DINO idea-research/grounding-dino-tiny style configuration >>> configuration = GroundingDINOConfig() - >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> # Initializing a model (with random weights) from the idea-research/grounding-dino-tiny style configuration >>> model = GroundingDINOModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 368830354e8eb7..81e5e8ce22b97a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2226,8 +2226,8 @@ def forward( self, pixel_values: Tensor, input_ids: Tensor, - token_type_ids: Tensor, - attention_mask: Tensor, + token_type_ids: Tensor = None, + attention_mask: Tensor = None, pixel_mask: Optional[Tensor] = None, encoder_outputs=None, output_attentions=None, @@ -2247,8 +2247,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") - >>> model = GroundingDINOModel.from_pretrained("SenseTime/deformable-detr") + >>> image_processor = AutoImageProcessor.from_pretrained("idea-research/grounding-dino-tiny") + >>> model = GroundingDINOModel.from_pretrained("idea-research/grounding-dino-tiny") >>> inputs = image_processor(images=image, return_tensors="pt") @@ -2559,8 +2559,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> image_processor = 
AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") - >>> model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr") + >>> image_processor = AutoImageProcessor.from_pretrained("idea-research/grounding-dino-tiny") + >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") >>> inputs = image_processor(images=image, return_tensors="pt") >>> outputs = model(**inputs) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 20f0a23fb42316..25b24786ab2054 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -209,9 +209,6 @@ class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTe def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - for k, v in inputs_dict.items(): - print(k, v.shape) - if return_labels: if model_class.__name__ == "GroundingDINOForObjectDetection": labels = [] @@ -413,7 +410,6 @@ def recursive_check(tuple_object, dict_object): recursive_check(tuple_output, dict_output) for model_class in self.all_model_classes: - print("Model class:", model_class) model = model_class(config) model.to(torch_device) model.eval() @@ -494,17 +490,13 @@ def test_forward_signature(self): # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] - if model.config.is_encoder_decoder: - expected_arg_names = ["pixel_values", "pixel_mask"] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" in arg_names - else [] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - else: - expected_arg_names = ["pixel_values", "pixel_mask"] - self.assertListEqual(arg_names[:1], expected_arg_names) + expected_arg_names = ["pixel_values", "input_ids"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) def test_different_timm_backbone(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From 3b2d57612dcb716889f338bc8bba6e4c11bea39b Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Oct 2023 19:53:11 +0200 Subject: [PATCH 089/252] More cleanup --- .../grounding_dino/modeling_grounding_dino.py | 70 ++----------------- 1 file changed, 4 insertions(+), 66 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 81e5e8ce22b97a..eff5899f0fb6e1 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -41,7 +41,7 @@ ) from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, + BaseModelOutputWithPooling, ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer @@ -3660,18 +3660,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class 
GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of - cross-attention is added between the self-attention layers, following the architecture described in [Attention is - all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, - Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set - to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and - `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. - """ - config_class = GroundingDINOTextPrenetConfig def __init__(self, config, add_pooling_layer=True): @@ -3708,45 +3696,16 @@ def forward( position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). 
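The cleanup above removes the decoder-specific machinery (cross-attention inputs, `past_key_values`, `use_cache`) from `GroundingDINOTextPrenet`, leaving a plain encoder-only BERT-style text backbone that returns a pooled output. As a rough illustration of that contract only, using the stock `bert-base-uncased` checkpoint as a stand-in rather than Grounding DINO's own text weights:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Stand-in illustration: bert-base-uncased behaves like the stripped-down text
# backbone (encoder-only, no cross-attention, no cache), returning a pooled output.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("a cat.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, 768)
print(outputs.pooler_output.shape)      # (batch_size, 768)
```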
- """ + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -3760,11 +3719,8 @@ def forward( batch_size, seq_length = input_shape device = input_ids.device if input_ids is not None else inputs_embeds.device - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + attention_mask = torch.ones(((batch_size, seq_length)), device=device) if token_type_ids is None: if hasattr(self.embeddings, "token_type_ids"): @@ -3778,17 +3734,6 @@ def forward( # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -3801,16 +3746,11 @@ def forward( position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -3821,11 +3761,9 @@ def forward( if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( + return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) From 88e5d0201d8c5fc06b2e1b13751847776d7ba431 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Oct 2023 20:09:52 +0200 Subject: [PATCH 090/252] More improvements --- .../grounding_dino/convert_grounding_dino_to_hf.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 13 ++++++------- 
.../grounding_dino/test_modeling_grounding_dino.py | 14 ++++++-------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index a0d0d205454217..2ddfcf34b80615 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -252,7 +252,7 @@ def create_rename_keys(state_dict, config): rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE if "feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("feat_map", "model.input_proj_text"))) + rename_keys.append((layer_name, layer_name.replace("feat_map", "model.text_projection"))) #### DECODER REFERENCE POINT HEAD if "transformer.decoder.ref_point_head" in layer_name: rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index eff5899f0fb6e1..b160e2b252988b 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1511,8 +1511,8 @@ def _init_weights(self, module): if p.dim() > 1: nn.init.xavier_uniform_(p) elif isinstance(module, GroundingDINOModel): - nn.init.constant_(module.input_proj_text.bias.data, 0) - nn.init.xavier_uniform_(module.input_proj_text.weight.data) + nn.init.constant_(module.text_projection.bias.data, 0) + nn.init.xavier_uniform_(module.text_projection.weight.data) for proj in module.input_proj_vision: nn.init.xavier_uniform_(proj[0].weight, gain=1) nn.init.constant_(proj[0].bias, 0) @@ -2108,7 +2108,7 @@ def __init__(self, config: GroundingDINOConfig): # Create text backbone self.text_backbone = GroundingDINOTextPrenet(config.text_backbone_config) - self.input_proj_text = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) + self.text_projection = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) @@ -2118,6 +2118,8 @@ def __init__(self, config: GroundingDINOConfig): self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + print("Two stage:", config.two_stage) + if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) @@ -2286,7 +2288,7 @@ def forward( text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ "last_hidden_state" ] - text_features = self.input_proj_text(text_features) + text_features = self.text_projection(text_features) batch_size, num_channels, height, width = pixel_values.shape device = pixel_values.device @@ -3223,9 +3225,6 @@ def forward( return embeddings -# Classes for Text Backbone (It's just a BERT model) - - # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText class GroundingDINOTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py 
b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 25b24786ab2054..59ebb0cdb6b2f0 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -161,7 +161,7 @@ def prepare_config_and_inputs_for_common(self): inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask, "input_ids": input_ids} return config, inputs_dict - def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask, input_ids, labels): + def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, labels): model = GroundingDINOModel(config=config) model.to(torch_device) model.eval() @@ -171,9 +171,7 @@ def create_and_check_grounding_dino_model(self, config, pixel_values, pixel_mask self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) - def create_and_check_grounding_dino_object_detection_head_model( - self, config, pixel_values, pixel_mask, input_ids, labels - ): + def create_and_check_object_detection_head_model(self, config, pixel_values, pixel_mask, input_ids, labels): model = GroundingDINOForObjectDetection(config=config) model.to(torch_device) model.eval() @@ -244,13 +242,13 @@ def test_config(self): self.config_tester.create_and_test_config_with_num_labels() self.config_tester.check_config_can_be_init_without_params() - def test_grounding_dino_model(self): + def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_grounding_dino_model(*config_and_inputs) + self.model_tester.create_and_check_model(*config_and_inputs) - def test_grounding_dino_object_detection_head_model(self): + def test_object_detection_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_grounding_dino_object_detection_head_model(*config_and_inputs) + self.model_tester.create_and_check_object_detection_head_model(*config_and_inputs) @unittest.skip(reason="Grounding DINO does not use inputs_embeds") def test_inputs_embeds(self): From 8bae1bd61ff9c1ff9cd8b2a220443437e8c8569f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 16 Oct 2023 17:00:11 -0300 Subject: [PATCH 091/252] Fixed tests, improved modeling and config --- .../configuration_grounding_dino.py | 12 +----- .../grounding_dino/modeling_grounding_dino.py | 39 +++++++++---------- .../test_modeling_grounding_dino.py | 19 ++++----- 3 files changed, 27 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 3a1740ceebcf27..6b1f6c1913e7e2 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -168,8 +168,7 @@ class GroundingDINOConfig(PretrainedConfig): The number of input channels. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects - [`GroundingDINOModel`] can detect in a single image. In case `two_stage` is set to `True`, we use - `two_stage_num_proposals` instead. + [`GroundingDINOModel`] can detect in a single image. encoder_layers (`int`, *optional*, defaults to 6): Number of encoder layers. 
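This commit also drops `two_stage_num_proposals` from the configuration, so `num_queries` now controls both the number of detection slots and the number of two-stage proposals, and the constructor keeps validating that `two_stage=True` requires `with_box_refine=True`. A hedged sketch using `GroundingDINOConfig` as the class is named at this point in the series (an in-progress API, not a released one):

```python
# Assumes the in-progress GroundingDINOConfig class registered earlier in this patch series.
from transformers import GroundingDINOConfig

config = GroundingDINOConfig(num_queries=300, two_stage=True, with_box_refine=True)
print(config.num_queries)  # 300 -- also the number of top-scoring two-stage proposals kept

# The constructor rejects the incompatible flag combination checked in __init__:
try:
    GroundingDINOConfig(two_stage=True, with_box_refine=False)
except ValueError as err:
    print(err)  # If two_stage is True, with_box_refine must be True.
```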
encoder_ffn_dim (`int`, *optional*, defaults to 2048): @@ -217,8 +216,6 @@ class GroundingDINOConfig(PretrainedConfig): two_stage (`bool`, *optional*, defaults to `True`): Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. - two_stage_num_proposals (`int`, *optional*, defaults to 900): - The number of region proposals to be generated, in case `two_stage` is set to `True`. with_box_refine (`bool`, *optional*, defaults to `True`): Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes based on the predictions from the previous layer. @@ -254,9 +251,6 @@ class GroundingDINOConfig(PretrainedConfig): two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): Whether to share the bbox embedding between the two-stage bbox generator and the region proposal generation. - two_stage_class_embed_share (`bool`, *optional*, defaults to `False`): - Whether to share the class embedding between the two-stage bbox generator and the region proposal - generation. positional_embedding_temperature (`float`, *optional*, defaults to 20): The temperature for Sine Positional Embedding that is used together with vision backbone. Examples: @@ -307,7 +301,6 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - two_stage_num_proposals=900, with_box_refine=True, class_cost=1, bbox_cost=5, @@ -325,7 +318,6 @@ def __init__( query_dim=4, decoder_bbox_embed_share=True, two_stage_bbox_embed_share=False, - two_stage_class_embed_share=False, positional_embedding_temperature=20, **kwargs, ): @@ -365,7 +357,6 @@ def __init__( self.encoder_n_points = encoder_n_points self.decoder_n_points = decoder_n_points self.two_stage = two_stage - self.two_stage_num_proposals = two_stage_num_proposals self.with_box_refine = with_box_refine if two_stage is True and with_box_refine is False: raise ValueError("If two_stage is True, with_box_refine must be True.") @@ -397,7 +388,6 @@ def __init__( self.two_stage_bbox_embed_share = two_stage_bbox_embed_share if two_stage_bbox_embed_share and not decoder_bbox_embed_share: raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") - self.two_stage_class_embed_share = two_stage_class_embed_share self.positional_embedding_temperature = positional_embedding_temperature super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index b160e2b252988b..36c631c9a85d12 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -284,7 +284,7 @@ class GroundingDINOModelOutput(ModelOutput): sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the attention softmax, used to compute the weighted average in the bi-attention heads. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): - Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. 
Output of bounding box binary classification (i.e. foreground and background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): @@ -387,7 +387,7 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Initial reference points sent through the Transformer decoder. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): - Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): @@ -1521,6 +1521,9 @@ def _init_weights(self, module): nn.init.constant_(module.reference_points.bias.data, 0.0) if hasattr(module, "level_embed"): nn.init.normal_(module.level_embed) + if isinstance(module, GroundingDINOMLPPredictionHead): + nn.init.constant_(module.layers[-1].weight.data, 0) + nn.init.constant_(module.layers[-1].bias.data, 0) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, GroundingDINODecoder): @@ -2123,8 +2126,14 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) - self.encoder_output_bbox_embed = None - self.encoder_output_class_embed = None + if config.two_stage_bbox_embed_share and config.decoder_bbox_embed_share and self.decoder.bbox_embed is not None: + self.encoder_output_bbox_embed = self.decoder.bbox_embed + else: + self.encoder_output_bbox_embed = GroundingDINOMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + self.encoder_output_class_embed = GroundingDINOContrastiveEmbedding(config) else: self.reference_points = nn.Embedding(config.num_queries, 4) @@ -2403,8 +2412,8 @@ def forward( delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) enc_outputs_coord_logits = delta_bbox + output_proposals - # only keep top scoring `config.two_stage_num_proposals` proposals - topk = self.config.two_stage_num_proposals + # only keep top scoring `config.num_queries` proposals + topk = self.config.num_queries topk_logits = enc_outputs_class.max(-1)[0] topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] topk_coords_logits = torch.gather( @@ -2492,9 +2501,6 @@ def __init__(self, config: GroundingDINOConfig): input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) - nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) - if config.decoder_bbox_embed_share: self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) else: @@ -2504,18 +2510,6 @@ def __init__(self, config: GroundingDINOConfig): self.model.decoder.bbox_embed = self.bbox_embed self.model.decoder.class_embed = self.class_embed - if config.two_stage: - if config.two_stage_bbox_embed_share: - self.model.encoder_output_bbox_embed = _bbox_embed - else: - 
self.model.encoder_output_bbox_embed = copy.deepcopy(_bbox_embed) - - # TODO don't believe this is necessary since class_embed has no parameters - if config.two_stage_class_embed_share: - self.model.encoder_output_class_embed = _class_embed - else: - self.model.encoder_output_class_embed = copy.deepcopy(_class_embed) - # Initialize weights and apply final processing self.post_init() @@ -2584,6 +2578,9 @@ def forward( ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs outputs = self.model( pixel_values=pixel_values, diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 59ebb0cdb6b2f0..fc398b0822594b 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -20,7 +20,7 @@ import unittest from typing import Dict, List, Tuple -from transformers import GroundingDINOConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available from transformers.file_utils import cached_property from transformers.testing_utils import ( require_timm, @@ -126,13 +126,12 @@ def prepare_config_and_inputs(self): return config, pixel_values, pixel_mask, input_ids, labels def get_config(self): - resnet_config = ResNetConfig( - num_channels=3, - embeddings_size=10, - hidden_sizes=[10, 20, 30, 40], - depths=[1, 1, 2, 1], - hidden_act="relu", - num_labels=3, + swin_config = SwinConfig( + window_size=7, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + image_size=self.image_size, out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) @@ -152,7 +151,7 @@ def get_config(self): encoder_n_points=self.encoder_n_points, decoder_n_points=self.decoder_n_points, use_timm_backbone=False, - backbone_config=resnet_config, + backbone_config=swin_config, max_text_len=self.max_text_len, ) @@ -167,7 +166,6 @@ def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, la model.eval() result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) - result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) @@ -177,7 +175,6 @@ def create_and_check_object_detection_head_model(self, config, pixel_values, pix model.eval() result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) - result = model(pixel_values) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) From f343f78f4af2f26c53b0d00036ece10cfac0cc46 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 17 Oct 2023 00:41:14 -0300 Subject: [PATCH 092/252] More improvements and fixing tests --- .../grounding_dino/modeling_grounding_dino.py | 200 +++++------------- .../test_modeling_grounding_dino.py | 32 +-- 2 files changed, 69 insertions(+), 163 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 36c631c9a85d12..9e657f168d3638 100644 --- 
a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -152,27 +152,17 @@ class GroundingDINODecoderOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the self-attention heads. - vision_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, - used to compute the weighted average in the cross-attention heads. - text_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights of the encoder's cross-attention layer, after the attention softmax, - used to compute the weighted average in the text cross-attention heads. + the self-attention, cross-attention and multi-scale deformable attention heads. """ last_hidden_state: torch.FloatTensor = None intermediate_hidden_states: torch.FloatTensor = None intermediate_reference_points: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - vision_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None - text_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None @dataclass @@ -181,8 +171,6 @@ class GroundingDINOEncoderOutput(ModelOutput): Base class for outputs of the GroundingDINOEncoder. This class extends BaseModelOutput, due to: - vision and text last hidden states - vision and text intermediate hidden states - - vision and text attentions - - vision and text cross attentions Args: last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -197,32 +185,17 @@ class GroundingDINOEncoderOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. 
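The previous commit in this series replaced `two_stage_num_proposals` with `config.num_queries` when selecting encoder region proposals (`topk = self.config.num_queries`, followed by a `torch.topk` over the class logits and a `torch.gather` over the box logits). A standalone sketch of that selection step with made-up shapes, purely for illustration:

```python
import torch

# Illustrative shapes only; these are not the module's actual tensors.
batch_size, num_proposals, num_labels, num_queries = 2, 1000, 256, 900
enc_outputs_class = torch.randn(batch_size, num_proposals, num_labels)
enc_outputs_coord_logits = torch.randn(batch_size, num_proposals, 4)

# Score each proposal by its best class logit, keep the top `num_queries` of them.
topk_logits = enc_outputs_class.max(-1)[0]                       # (batch_size, num_proposals)
topk_proposals = torch.topk(topk_logits, num_queries, dim=1)[1]  # indices of kept proposals
topk_coords_logits = torch.gather(
    enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
)
print(topk_coords_logits.shape)  # (batch_size, num_queries, 4)
```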
- attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, - used to compute the weighted average in the multi-scale deformable attention heads. - attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, - used to compute the weighted average in the self-attention heads. - cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. - cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. + attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. """ last_hidden_state_vision: torch.FloatTensor = None last_hidden_state_text: torch.FloatTensor = None hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None - attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - attentions_text: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None @dataclass @@ -243,18 +216,10 @@ class GroundingDINOModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. - decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, - num_queries)`. 
Attentions weights of the decoder, after the attention softmax, used to compute the weighted - average in the self-attention heads. - decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the - weighted average in the cross-attention heads. - decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the - weighted average in the cross-attention heads. + decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention, cross-attention and multi-scale deformable attention heads. encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -267,21 +232,10 @@ class GroundingDINOModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, - used to compute the weighted average in the multi-scale deformable attention heads. - encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, - used to compute the weighted average in the self-attention heads. - encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. 
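Likewise, a hedged sketch for the consolidated `encoder_attentions`, assuming the four-way grouping assembled in the encoder's forward further down (fused vision, fused text, text-enhancer self-attention, multi-scale deformable); `outputs` stands for a `GroundingDINOModelOutput` produced with `output_attentions=True`:

fused_vision_attns, fused_text_attns, enhanced_text_attns, deformable_attns = outputs.encoder_attentions
# each inner tuple is expected to hold one tensor per encoder layer
assert len(deformable_attns) == len(enhanced_text_attns)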
- encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder's fusion layer, after the + encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. attention softmax, used to compute the weighted average in the bi-attention heads. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are @@ -296,17 +250,12 @@ class GroundingDINOModelOutput(ModelOutput): intermediate_hidden_states: torch.FloatTensor = None intermediate_reference_points: torch.FloatTensor = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None - encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @@ -340,18 +289,10 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. - decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, - num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted - average in the self-attention heads. - decoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. 
- Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the - weighted average in the cross-attention heads. - decoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. - Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the - weighted average in the cross-attention heads. + decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention, cross-attention and multi-scale deformable attention heads. encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -364,22 +305,10 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder, after the attention softmax, - used to compute the weighted average in the multi-scale deformable attention heads. - encoder_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the text encoder, after the attention softmax, - used to compute the weighted average in the self-attention heads. - encoder_cross_attentions_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each text encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. Attentions weights of the vision encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. - encoder_cross_attentions_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each vision encoder layer) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. 
Attentions weights of the text encoder's fusion layer, after the - attention softmax, used to compute the weighted average in the bi-attention heads. + encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -404,17 +333,16 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): intermediate_hidden_states: Optional[torch.FloatTensor] = None intermediate_reference_points: Optional[torch.FloatTensor] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - decoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - decoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None - encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + # encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + # encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None + # encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None + # encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @@ -1704,6 +1632,7 @@ def forward( encoder_vision_states = () if output_hidden_states else None encoder_text_states = () if output_hidden_states else None + all_attns = () if output_attentions else None all_attn_fused_text = () if output_attentions else None all_attn_fused_vision = () if output_attentions else None all_attn_enhanced_text = () if output_attentions else None @@ -1712,18 +1641,7 @@ def forward( if output_hidden_states: encoder_vision_states += (vision_features,) encoder_text_states += (text_features,) - # INPUTS FOR ENCODER LAYER - # - vision_features: Tensor, - # - vision_position_embedding: Tensor, - # - spatial_shapes: Tensor, - # - level_start_index: Tensor, - # - key_padding_mask: Tensor, - # - reference_points: Tensor, - # - text_features: Optional[Tensor] = None, - # - text_attention_mask: Optional[Tensor] = None, - # - text_position_embedding: Optional[Tensor] = None, - # - text_self_attention_masks: Optional[Tensor] = 
None, - # - text_position_ids: Optional[Tensor] = None + (vision_features, text_features), attentions = encoder_layer( vision_features=vision_features, vision_position_embedding=vision_position_embedding, @@ -1748,14 +1666,14 @@ def forward( encoder_vision_states += (vision_features,) encoder_text_states += (text_features,) + if output_attentions: + all_attns = (all_attn_fused_vision, all_attn_fused_text, all_attn_enhanced_text, all_attn_deformable) + if not return_dict: enc_outputs = [ vision_features, text_features, - all_attn_fused_vision, - all_attn_fused_text, - all_attn_enhanced_text, - all_attn_deformable, + all_attns ] return tuple(v for v in enc_outputs if v is not None) return GroundingDINOEncoderOutput( @@ -1763,10 +1681,7 @@ def forward( last_hidden_state_text=text_features, hidden_states_vision=encoder_vision_states, hidden_states_text=encoder_text_states, - cross_attentions_vision=all_attn_fused_vision, - cross_attentions_text=all_attn_fused_text, - attentions_vision=all_attn_deformable, - attentions_text=all_attn_enhanced_text, + attentions=all_attns, ) @@ -1899,6 +1814,7 @@ def forward( # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None + all_attns = () if output_attentions else None all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None intermediate = () @@ -1998,6 +1914,9 @@ def custom_forward(*inputs): if output_hidden_states: all_hidden_states += (hidden_states,) + if output_attentions: + all_attns += (all_self_attns, all_cross_attns_text, all_cross_attns_vision) + if not return_dict: return tuple( v @@ -2006,9 +1925,7 @@ def custom_forward(*inputs): intermediate, intermediate_reference_points, all_hidden_states, - all_self_attns, - all_cross_attns_vision, - all_cross_attns_text, + all_attns, ] if v is not None ) @@ -2017,9 +1934,7 @@ def custom_forward(*inputs): intermediate_hidden_states=intermediate, intermediate_reference_points=intermediate_reference_points, hidden_states=all_hidden_states, - attentions=all_self_attns, - vision_cross_attentions=all_cross_attns_vision, - text_cross_attentions=all_cross_attns_text, + attentions=all_attns, ) @@ -2388,10 +2303,7 @@ def forward( last_hidden_state_text=encoder_outputs[1], hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, - attentions_vision=encoder_outputs[4] if len(encoder_outputs) > 4 else None, - attentions_text=encoder_outputs[5] if len(encoder_outputs) > 5 else None, - cross_attentions_vision=encoder_outputs[6] if len(encoder_outputs) > 6 else None, - cross_attentions_text=encoder_outputs[7] if len(encoder_outputs) > 7 else None, + attentions=encoder_outputs[4] if len(encoder_outputs) > 4 else None, ) # Fifth, prepare decoder inputs @@ -2463,16 +2375,11 @@ def forward( intermediate_reference_points=decoder_outputs.intermediate_reference_points, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, - decoder_cross_attentions_vision=decoder_outputs.vision_cross_attentions, - decoder_cross_attentions_text=decoder_outputs.text_cross_attentions, encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, 
encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, encoder_hidden_states_text=encoder_outputs.hidden_states_text, - encoder_attentions_vision=encoder_outputs.attentions_vision, - encoder_attentions_text=encoder_outputs.attentions_text, - encoder_cross_attentions_vision=encoder_outputs.cross_attentions_vision, - encoder_cross_attentions_text=encoder_outputs.cross_attentions_text, + encoder_attentions=encoder_outputs.attentions, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, ) @@ -2487,7 +2394,7 @@ def forward( ) class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required - _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*"] def __init__(self, config: GroundingDINOConfig): super().__init__(config) @@ -2595,7 +2502,7 @@ def forward( ) hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] - enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[9] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[7] init_reference = outputs.init_reference_points if return_dict else outputs[0] inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] @@ -2686,16 +2593,11 @@ def forward( last_hidden_state=outputs.last_hidden_state, decoder_hidden_states=outputs.decoder_hidden_states, decoder_attentions=outputs.decoder_attentions, - decoder_cross_attentions_vision=outputs.decoder_cross_attentions_vision, - decoder_cross_attentions_text=outputs.decoder_cross_attentions_text, encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, encoder_hidden_states_text=outputs.encoder_hidden_states_text, - encoder_attentions_vision=outputs.encoder_attentions_vision, - encoder_attentions_text=outputs.encoder_attentions_text, - encoder_cross_attentions_text=outputs.encoder_cross_attentions_text, - encoder_cross_attentions_vision=outputs.encoder_cross_attentions_vision, + encoder_attentions=outputs.encoder_attentions, intermediate_hidden_states=outputs.intermediate_hidden_states, intermediate_reference_points=outputs.intermediate_reference_points, init_reference_points=outputs.init_reference_points, diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index fc398b0822594b..8592f9036dac10 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -176,13 +176,13 @@ def create_and_check_object_detection_head_model(self, config, pixel_values, pix result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, config.max_text_len)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids, labels=labels) self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, config.max_text_len)) self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) @@ -280,7 +280,7 @@ def test_attention_outputs(self): model.eval() with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions + attentions = outputs.encoder_attentions[-1] self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) # check that output_attentions also work using config @@ -291,7 +291,7 @@ def test_attention_outputs(self): model.eval() with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions + attentions = outputs.encoder_attentions[-1] self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( @@ -304,7 +304,7 @@ def test_attention_outputs(self): ) out_len = len(outputs) - correct_outlen = 8 + correct_outlen = 10 # loss is at first position if "labels" in inputs_dict: @@ -316,7 +316,7 @@ def test_attention_outputs(self): self.assertEqual(out_len, correct_outlen) # decoder attentions - decoder_attentions = outputs.decoder_attentions + decoder_attentions = outputs.decoder_attentions[0] self.assertIsInstance(decoder_attentions, (list, tuple)) self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( @@ -325,7 +325,7 @@ def test_attention_outputs(self): ) # cross attentions - cross_attentions = outputs.cross_attentions + cross_attentions = outputs.decoder_attentions[-1] self.assertIsInstance(cross_attentions, (list, tuple)) self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( @@ -349,12 +349,12 @@ def test_attention_outputs(self): if hasattr(self.model_tester, "num_hidden_states_types"): added_hidden_states = self.model_tester.num_hidden_states_types elif self.is_encoder_decoder: - added_hidden_states = 2 + added_hidden_states = 3 else: added_hidden_states = 1 self.assertEqual(out_len + added_hidden_states, len(outputs)) - self_attentions = outputs.encoder_attentions + self_attentions = outputs.encoder_attentions[-1] self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( @@ -409,18 +409,22 @@ def recursive_check(tuple_object, dict_object): model.to(torch_device) model.eval() + print("Done 1") tuple_inputs = self._prepare_for_class(inputs_dict, model_class) dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs) + print("Done 2") tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence(model, tuple_inputs, dict_inputs) + print("Done 3") tuple_inputs = self._prepare_for_class(inputs_dict, model_class) dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + print("Done 4") tuple_inputs = self._prepare_for_class(inputs_dict, model_class) dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) @@ -458,15 +462,15 @@ def test_retain_grad_hidden_states_attentions(self): # we take the second output since last_hidden_state is the second item output = outputs[1] - encoder_hidden_states = 
outputs.encoder_hidden_states[0] - encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states = outputs.encoder_hidden_states_vision[0] + encoder_attentions = outputs.encoder_attentions[0][0] encoder_hidden_states.retain_grad() encoder_attentions.retain_grad() - decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions = outputs.decoder_attentions[0][0] decoder_attentions.retain_grad() - cross_attentions = outputs.cross_attentions[0] + cross_attentions = outputs.decoder_attentions[-1][0] cross_attentions.retain_grad() output.flatten()[0].backward(retain_graph=True) @@ -510,7 +514,7 @@ def test_different_timm_backbone(self): expected_shape = ( self.model_tester.batch_size, self.model_tester.num_queries, - self.model_tester.num_labels, + config.max_text_len, ) self.assertEqual(outputs.logits.shape, expected_shape) From 033d9039bbaa15f641b90987d6c752a36bb01f9e Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 18 Oct 2023 17:48:59 -0300 Subject: [PATCH 093/252] Improved tests and modeling --- .../configuration_grounding_dino.py | 15 +++-- .../convert_grounding_dino_to_hf.py | 2 +- .../test_modeling_grounding_dino.py | 66 ++++++------------- 3 files changed, 32 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 6b1f6c1913e7e2..869028e3cc2514 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -370,11 +370,16 @@ def __init__( self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone - self.text_backbone_config = ( - GroundingDINOTextPrenetConfig() - if text_backbone_config is None - else GroundingDINOTextPrenetConfig(**text_backbone_config) - ) + if text_backbone_config is None: + self.text_backbone_config = GroundingDINOTextPrenetConfig() + elif isinstance(text_backbone_config, dict): + self.text_backbone_config = GroundingDINOTextPrenetConfig(**text_backbone_config) + elif isinstance(text_backbone_config, GroundingDINOTextPrenetConfig): + self.text_backbone_config = text_backbone_config + else: + raise ValueError( + f"`text_backbone_config` should be either a `dict` or a `GroundingDINOTextPrenetConfig` instance instead got {type(text_backbone_config)}" + ) self.max_text_len = max_text_len # Text Enhancer self.text_enhancer_dropout = text_enhancer_dropout diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 2ddfcf34b80615..3d362d8e92dd62 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -391,7 +391,7 @@ def convert_grounding_dino_checkpoint(args): [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ) assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4) + assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-3) print("Looks ok!") if pytorch_dump_folder_path is not None: diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 8592f9036dac10..095d768b886ff0 100644 --- 
a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -20,7 +20,7 @@ import unittest from typing import Dict, List, Tuple -from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available +from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available, GroundingDINOTextPrenetConfig from transformers.file_utils import cached_property from transformers.testing_utils import ( require_timm, @@ -58,14 +58,14 @@ def __init__( use_labels=True, hidden_size=32, num_hidden_layers=2, - num_attention_heads=8, - intermediate_size=8, + num_attention_heads=4, + intermediate_size=4, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, num_queries=12, num_channels=3, - image_size=196, + image_size=64, n_targets=8, num_labels=91, num_feature_levels=4, @@ -128,13 +128,20 @@ def prepare_config_and_inputs(self): def get_config(self): swin_config = SwinConfig( window_size=7, - embed_dim=96, - depths=[2, 2, 18, 2], - num_heads=[3, 6, 12, 24], + embed_dim=16, + depths=[1, 1, 1, 1], + num_heads=[1, 1, 1, 1], image_size=self.image_size, out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) + text_backbone = GroundingDINOTextPrenetConfig( + hidden_size=8, + num_hidden_layers=2, + num_attention_heads=2, + intermediate_size=8, + max_position_embeddings=8 + ) return GroundingDINOConfig( d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, @@ -153,6 +160,7 @@ def get_config(self): use_timm_backbone=False, backbone_config=swin_config, max_text_len=self.max_text_len, + text_backbone_config=text_backbone ) def prepare_config_and_inputs_for_common(self): @@ -592,11 +600,9 @@ def test_inference_object_detection_head(self): expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]).to(torch_device) expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ).to(torch_device) - expected_boxes = torch.tensor( - [[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]] + [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) @@ -607,47 +613,17 @@ def test_inference_object_detection_head(self): # verify postprocessing results = image_processor.post_process_object_detection( - outputs, threshold=0.3, target_sizes=[image.size[::-1]] + outputs, threshold=0.35, target_sizes=[image.size[::-1]] )[0] - expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]).to(torch_device) + expected_scores = torch.tensor([0.4526, 0.4082]).to(torch_device) expected_labels = [17, 17, 75, 75, 63] - expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841]).to(torch_device) + expected_slice_boxes = torch.tensor([491.1074, 198.5045, 292.5861, 350.6499]).to(torch_device) - self.assertEqual(len(results["scores"]), 5) + self.assertEqual(len(results["scores"]), 2) self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4)) self.assertSequenceEqual(results["labels"].tolist(), expected_labels) self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) - def 
test_inference_object_detection_head_with_box_refine_two_stage(self): - model = GroundingDINOForObjectDetection.from_pretrained( - "SenseTime/deformable-detr-with-box-refine-two-stage" - ).to(torch_device) - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="pt").to(torch_device) - pixel_values = encoding["pixel_values"].to(torch_device) - pixel_mask = encoding["pixel_mask"].to(torch_device) - - with torch.no_grad(): - outputs = model(pixel_values, pixel_mask) - - expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) - self.assertEqual(outputs.logits.shape, expected_shape_logits) - - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ).to(torch_device) - expected_boxes = torch.tensor( - [[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]] - ).to(torch_device) - - self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) - - expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) - @require_torch_gpu def test_inference_object_detection_head_equivalence_cpu_gpu(self): image_processor = self.default_image_processor From baed29afc84954346d4f61e6dd9de56cecc9eba8 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 21 Oct 2023 01:03:07 -0300 Subject: [PATCH 094/252] Improved tests and added image processor --- src/transformers/__init__.py | 2 + .../models/auto/image_processing_auto.py | 2 +- .../models/grounding_dino/__init__.py | 2 + .../convert_grounding_dino_to_hf.py | 3 +- .../image_processing_grounding_dino.py | 1401 +++++++++++++++++ .../processing_grounding_dino.py | 2 +- .../test_modeling_grounding_dino.py | 117 +- 7 files changed, 1499 insertions(+), 30 deletions(-) create mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c73345163a37b9..265b95c17ac5c6 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -979,6 +979,7 @@ _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) + _import_structure["models.grounding_dino"].extend(["GroundingDINOImageProcessor"]) _import_structure["models.idefics"].extend(["IdeficsImageProcessor"]) _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"]) _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"]) @@ -5074,6 +5075,7 @@ from .models.efficientnet import EfficientNetImageProcessor from .models.flava import FlavaFeatureExtractor, FlavaImageProcessor, FlavaProcessor from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor + from .models.grounding_dino import GroundingDINOImageProcessor from .models.idefics import IdeficsImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor diff --git a/src/transformers/models/auto/image_processing_auto.py 
b/src/transformers/models/auto/image_processing_auto.py index cf33369ef5492d..6399fe192616af 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -66,7 +66,7 @@ ("focalnet", "BitImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), - ("grounding-dino", "DeformableDetrImageProcessor"), + ("grounding-dino", "GroundingDINOImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 229666382564b8..8002244b4287cd 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -24,6 +24,7 @@ "GroundingDINOTextPrenetConfig", ], "processing_grounding_dino": ["GroundingDINOProcessor"], + "image_processing_grounding_dino": ["GroundingDINOImageProcessor"] } try: @@ -47,6 +48,7 @@ GroundingDINOTextPrenetConfig, ) from .processing_grounding_dino import GroundingDINOProcessor + from .image_processing_grounding_dino import GroundingDINOImageProcessor try: if not is_torch_available(): diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 3d362d8e92dd62..ce48e78e219e8a 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -29,6 +29,7 @@ GroundingDINOConfig, GroundingDINOForObjectDetection, GroundingDINOProcessor, + GroundingDINOImageProcessor ) @@ -369,7 +370,7 @@ def convert_grounding_dino_checkpoint(args): transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) original_pixel_values = transforms(image).unsqueeze(0) - image_processor = DeformableDetrImageProcessor() + image_processor = GroundingDINOImageProcessor() tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py new file mode 100644 index 00000000000000..44c7a8dabc3f1b --- /dev/null +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -0,0 +1,1401 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
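A small construction sketch mirroring how the conversion script above wires the new image processor into the processor; the tokenizer checkpoint is the one used there, the rest is illustrative:

from transformers import AutoTokenizer, GroundingDINOImageProcessor, GroundingDINOProcessor

image_processor = GroundingDINOImageProcessor()                 # size defaults to shortest_edge=800 / longest_edge=1333
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # text backbone tokenizer used in the conversion script
processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer)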
+"""Image processor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import BaseImageProcessor, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_coco_detection_annotations, + valid_coco_panoptic_annotations, + valid_images, +) +from ...utils import ( + ExplicitEnum, + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + +if is_scipy_available(): + import scipy.special + import scipy.stats + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotionFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC) + + +# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (height <= width and height == size) or (width <= height and width == size): + return height, width + + if width < height: + ow = size + oh = int(size * height / width) + else: + oh = size + ow = int(size * width / height) + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + image_size (`Tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. 
+ input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> List[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. 
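To make the resizing rule concrete, a hand-worked example derived from `get_size_with_aspect_ratio` as written above (not part of the patch):

# For a (height=480, width=640) image with size=800, max_size=1333:
#   640 / 480 * 800 ≈ 1067 <= 1333, so the shorter-edge target stays 800
#   -> (oh, ow) = (800, int(800 * 640 / 480)) = (800, 1066)
# With size=1200, max_size=1333 the cap applies:
#   640 / 480 * 1200 = 1600 > 1333, so size becomes int(round(1333 * 480 / 640)) = 1000
#   -> (oh, ow) = (1000, 1333)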
+ """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask +def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`List[List[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->DeformableDetr +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by DeformableDetr. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. 
+ annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + keypoints = np.asarray(keypoints, dtype=np.float32) + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints[keep] + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width) + new_target["masks"] = masks[keep] + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes +def masks_to_boxes(masks: np.ndarray) -> np.ndarray: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DeformableDetr +def prepare_coco_panoptic_annotation( + image: np.ndarray, + target: Dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> Dict: + """ + Prepare a coco panoptic annotation for DeformableDetr. 
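A hand-worked illustration of the box handling in `prepare_coco_detection_annotation` above; the target dict is a made-up COCO-style example, not taken from the patch:

# target = {"image_id": 42,
#           "annotations": [{"bbox": [10.0, 20.0, 30.0, 40.0], "category_id": 17, "area": 1200.0, "iscrowd": 0}]}
# "bbox" is COCO [x, y, w, h]; `boxes[:, 2:] += boxes[:, :2]` converts it to corner format:
#   [10, 20, 30, 40] -> [10, 20, 40, 60], which is then clipped to the image width/height
#   and kept only if x_max > x_min and y_max > y_min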
+ """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64) + new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64) + new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64) + + if "segments_info" in target: + masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([segment_info["id"] for segment_info in target["segments_info"]]) + masks = masks == ids[:, None, None] + masks = masks.astype(np.uint8) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = np.array( + [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["iscrowd"] = np.asarray( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["area"] = np.asarray( + [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32 + ) + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.get_segmentation_image +def get_segmentation_image( + masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False +): + h, w = input_size + final_h, final_w = target_size + + m_id = scipy.special.softmax(masks.transpose(0, 1), -1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = np.zeros((h, w), dtype=np.int64) + else: + m_id = m_id.argmax(-1).reshape(h, w) + + if deduplicate: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + for eq_id in equiv: + m_id[m_id == eq_id] = equiv[0] + + seg_img = id_to_rgb(m_id) + seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST) + return seg_img + + +# Copied from transformers.models.detr.image_processing_detr.get_mask_area +def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray: + final_h, final_w = target_size + np_seg_img = seg_img.astype(np.uint8) + np_seg_img = np_seg_img.reshape(final_h, final_w, 3) + m_id = rgb_to_id(np_seg_img) + area = [(m_id == i).sum() for i in range(n_classes)] + return area + + +# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities +def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + probs = scipy.special.softmax(logits, axis=-1) + labels = probs.argmax(-1, keepdims=True) + scores = np.take_along_axis(probs, labels, axis=-1) + scores, labels = scores.squeeze(-1), labels.squeeze(-1) + return scores, labels + + +# Copied from transformers.models.detr.image_processing_detr.post_process_panoptic_sample +def post_process_panoptic_sample( + out_logits: np.ndarray, + masks: np.ndarray, + boxes: np.ndarray, + processed_size: Tuple[int, int], + target_size: Tuple[int, int], + is_thing_map: Dict, + threshold=0.85, +) -> Dict: + """ + Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample. + + Args: + out_logits (`torch.Tensor`): + The logits for this sample. + masks (`torch.Tensor`): + The predicted segmentation masks for this sample. + boxes (`torch.Tensor`): + The prediced bounding boxes for this sample. 
The boxes are in the normalized format `(center_x, center_y, + width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding). + processed_size (`Tuple[int, int]`): + The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size + after data augmentation but before batching. + target_size (`Tuple[int, int]`): + The target size of the image, `(height, width)` corresponding to the requested final size of the + prediction. + is_thing_map (`Dict`): + A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not. + threshold (`float`, *optional*, defaults to 0.85): + The threshold used to binarize the segmentation masks. + """ + # we filter empty queries and detection below threshold + scores, labels = score_labels_from_class_probabilities(out_logits) + keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold) + + cur_scores = scores[keep] + cur_classes = labels[keep] + cur_boxes = center_to_corners_format(boxes[keep]) + + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + cur_masks = masks[keep] + cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR) + cur_masks = safe_squeeze(cur_masks, 1) + b, h, w = cur_masks.shape + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.reshape(b, -1) + stuff_equiv_classes = defaultdict(list) + for k, label in enumerate(cur_classes): + if not is_thing_map[label]: + stuff_equiv_classes[label].append(k) + + seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores)) + + # We filter out any mask that is too small + if cur_classes.size() > 0: + # We know filter empty masks as long as we find some + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + while filtered_small.any(): + cur_masks = cur_masks[~filtered_small] + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores)) + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + else: + cur_classes = np.ones((1, 1), dtype=np.int64) + + segments_info = [ + {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a} + for i, (cat, a) in enumerate(zip(cur_classes, area)) + ] + del cur_classes + + with io.BytesIO() as out: + PIL.Image.fromarray(seg_img).save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + + return predictions + + +# Copied from transformers.models.detr.image_processing_detr.resize_annotation +def resize_annotation( + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. 
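For the scaling step in `resize_annotation`, a small numeric sketch (values chosen for illustration only):

# orig_size=(480, 640), target_size=(800, 1066) -> (ratio_height, ratio_width) ≈ (1.667, 1.666)
# a corner-format box [10, 20, 40, 60] is scaled as
#   [10 * ratio_width, 20 * ratio_height, 40 * ratio_width, 60 * ratio_height] ≈ [16.7, 33.3, 66.6, 100.0]
# and "area" is scaled by ratio_width * ratio_height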
+ threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. + """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle +def binary_mask_to_rle(mask): + """ + Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + mask (`torch.Tensor` or `numpy.array`): + A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target + segment_id or class_id. + Returns: + `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE + format. + """ + if is_torch_tensor(mask): + mask = mask.numpy() + + pixels = mask.flatten() + pixels = np.concatenate([[0], pixels, [0]]) + runs = np.where(pixels[1:] != pixels[:-1])[0] + 1 + runs[1::2] -= runs[::2] + return list(runs) + + +# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle +def convert_segmentation_to_rle(segmentation): + """ + Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + segmentation (`torch.Tensor` or `numpy.array`): + A segmentation map of shape `(height, width)` where each value denotes a segment or class id. + Returns: + `List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id. + """ + segment_ids = torch.unique(segmentation) + + run_length_encodings = [] + for idx in segment_ids: + mask = torch.where(segmentation == idx, 1, 0) + rle = binary_mask_to_rle(mask) + run_length_encodings.append(rle) + + return run_length_encodings + + +# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects +def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels): + """ + Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and + `labels`. + + Args: + masks (`torch.Tensor`): + A tensor of shape `(num_queries, height, width)`. + scores (`torch.Tensor`): + A tensor of shape `(num_queries)`. + labels (`torch.Tensor`): + A tensor of shape `(num_queries)`. + object_mask_threshold (`float`): + A number between 0 and 1 used to binarize the masks. + Raises: + `ValueError`: Raised when the first dimension doesn't match in all input tensors. + Returns: + `Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region + < `object_mask_threshold`. 
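A quick numeric illustration of the rule implemented just below (all values are made up): a query survives only if its score clears `object_mask_threshold` and its label differs from `num_labels`, the "no object" class.

    import torch

    scores = torch.tensor([0.9, 0.2, 0.7])
    labels = torch.tensor([1, 3, 4])            # suppose num_labels == 4, i.e. label 4 means "no object"
    masks = torch.zeros(3, 8, 8)

    to_keep = labels.ne(4) & (scores > 0.5)     # -> tensor([ True, False, False])
    masks, scores, labels = masks[to_keep], scores[to_keep], labels[to_keep]  # only the first query is kept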
+ """ + if not (masks.shape[0] == scores.shape[0] == labels.shape[0]): + raise ValueError("mask, scores and labels must have the same shape!") + + to_keep = labels.ne(num_labels) & (scores > object_mask_threshold) + + return masks[to_keep], scores[to_keep], labels[to_keep] + + +# Copied from transformers.models.detr.image_processing_detr.check_segment_validity +def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8): + # Get the mask associated with the k class + mask_k = mask_labels == k + mask_k_area = mask_k.sum() + + # Compute the area of all the stuff in query k + original_area = (mask_probs[k] >= mask_threshold).sum() + mask_exists = mask_k_area > 0 and original_area > 0 + + # Eliminate disconnected tiny segments + if mask_exists: + area_ratio = mask_k_area / original_area + if not area_ratio.item() > overlap_mask_area_threshold: + mask_exists = False + + return mask_exists, mask_k + + +# Copied from transformers.models.detr.image_processing_detr.compute_segments +def compute_segments( + mask_probs, + pred_scores, + pred_labels, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[Set[int]] = None, + target_size: Tuple[int, int] = None, +): + height = mask_probs.shape[1] if target_size is None else target_size[0] + width = mask_probs.shape[2] if target_size is None else target_size[1] + + segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device) + segments: List[Dict] = [] + + if target_size is not None: + mask_probs = nn.functional.interpolate( + mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + )[0] + + current_segment_id = 0 + + # Weigh each mask by its prediction score + mask_probs *= pred_scores.view(-1, 1, 1) + mask_labels = mask_probs.argmax(0) # [height, width] + + # Keep track of instances of each class + stuff_memory_list: Dict[str, int] = {} + for k in range(pred_labels.shape[0]): + pred_class = pred_labels[k].item() + should_fuse = pred_class in label_ids_to_fuse + + # Check if mask exists and large enough to be a segment + mask_exists, mask_k = check_segment_validity( + mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold + ) + + if mask_exists: + if pred_class in stuff_memory_list: + current_segment_id = stuff_memory_list[pred_class] + else: + current_segment_id += 1 + + # Add current object segment to final segmentation map + segmentation[mask_k] = current_segment_id + segment_score = round(pred_scores[k].item(), 6) + segments.append( + { + "id": current_segment_id, + "label_id": pred_class, + "was_fused": should_fuse, + "score": segment_score, + } + ) + if should_fuse: + stuff_memory_list[pred_class] = current_segment_id + + return segmentation, segments + + +class GroundingDINOImageProcessor(BaseImageProcessor): + r""" + Constructs a Grounding DINO image processor. + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in + the `preprocess` method. 
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be + overridden by the `do_pad` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ + def __init__( + self, + format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_pad: bool = True, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + + @classmethod + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->DeformableDetr + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. 
`DeformableDetrImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DeformableDetr + def prepare_annotation( + self, + image: np.ndarray, + target: Dict, + format: Optional[AnnotionFormat] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into DeformableDetr model. + """ + format = format if format is not None else self.format + + if format == AnnotionFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotionFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare + def prepare(self, image, target, return_segmentation_masks=None, masks_path=None): + logger.warning_once( + "The `prepare` method is deprecated and will be removed in a v4.33. " + "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " + "does not return the image anymore.", + ) + target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format) + return image, target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask + def convert_coco_poly_to_mask(self, *args, **kwargs): + logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ") + return convert_coco_poly_to_mask(*args, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection + def prepare_coco_detection(self, *args, **kwargs): + logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ") + return prepare_coco_detection_annotation(*args, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic + def prepare_coco_panoptic(self, *args, **kwargs): + logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. 
") + return prepare_coco_panoptic_annotation(*args, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or + `height` and `width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> Dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format. + """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: Tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + return padded_image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: List[np.ndarray], + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + image (`np.ndarray`): + Image to pad. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
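A rough sketch of what this padding produces, assuming `image_processor` stands for an instantiated `GroundingDINOImageProcessor` and the shapes are purely illustrative:

    import numpy as np

    images = [
        np.zeros((3, 480, 640), dtype=np.float32),
        np.zeros((3, 512, 512), dtype=np.float32),
    ]
    batch = image_processor.pad(images, return_tensors="np")
    # pixel_values are padded on the bottom/right to the per-batch maximum, here shape (2, 3, 512, 640)
    # pixel_mask[i] holds 1 over the original image area and 0 over the padded region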
+ """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + padded_images = [ + self._pad_image( + image, + pad_size, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotionFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image after resizing. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. 
+ do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. + format (`str` or `AnnotionFormat`, *optional*, defaults to self.format): + Format of the annotations. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + max_size = None + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." 
+ ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, max_size=max_size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_pad = self.do_pad if do_pad is None else do_pad + format = self.format if format is None else format + + if do_resize is not None and size is None: + raise ValueError("Size and max_size must be specified if do_resize is True.") + + if do_rescale is not None and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize is not None and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + images = make_list_of_images(images) + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + format = AnnotionFormat(format) + if annotations is not None: + if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations): + raise ValueError( + "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts " + "(batch of images) with the following keys: `image_id` and `annotations`, with the latter " + "being a list of annotations in the COCO format." + ) + elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations): + raise ValueError( + "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts " + "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with " + "the latter being a list of annotations in the COCO format." + ) + elif format not in SUPPORTED_ANNOTATION_FORMATS: + raise ValueError( + f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}" + ) + + if ( + masks_path is not None + and format == AnnotionFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
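        # As an illustration of that inference: an array shaped (3, 480, 640) is typically read as channels-first,
        # while (480, 640, 3) is read as channels-last; when the guess could be ambiguous, callers can pass
        # `input_data_format` explicitly rather than rely on the inference below.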
+ input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + if annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + data = self.pad( + images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + data = {"pixel_values": images} + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDINO + def post_process_object_detection( + self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None + ): + """ + Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`OwlViTObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. 
+ Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + # TODO: (amy) add support for other frameworks + logits, boxes = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + probs = torch.max(logits, dim=-1) + scores = torch.sigmoid(probs.values) + labels = probs.indices + + # Convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 10fd6e9834a9c3..131eb2c600a1cd 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -40,7 +40,7 @@ class GroundingDINOProcessor(ProcessorMixin): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "DeformableDetrImageProcessor" + image_processor_class = "GroundingDINOImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer): diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 095d768b886ff0..d06637a3a36ad5 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -46,7 +46,7 @@ if is_vision_available(): from PIL import Image - from transformers import AutoImageProcessor + from transformers import AutoImageProcessor, AutoProcessor class GroundingDINOModelTester: @@ -95,12 +95,15 @@ def __init__( self.max_text_len = max_text_len # we also set the expected seq length for both encoder and decoder - self.encoder_seq_length = ( + self.encoder_seq_length_vision = ( math.ceil(self.image_size / 8) ** 2 + math.ceil(self.image_size / 16) ** 2 + math.ceil(self.image_size / 32) ** 2 + math.ceil(self.image_size / 64) ** 2 ) + + self.encoder_seq_length_text = self.max_text_len + self.decoder_seq_length = self.num_queries def prepare_config_and_inputs(self): @@ -451,6 +454,66 @@ def recursive_check(tuple_object, dict_object): model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} ) + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states_vision + + expected_num_layers = getattr( + self.model_tester, 
"expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_len = self.model_tester.encoder_seq_length_vision + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + hidden_states = outputs.encoder_hidden_states_text + + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_len = self.model_tester.encoder_seq_length_text + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_retain_grad_hidden_states_attentions(self): # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad @@ -576,28 +639,31 @@ def prepare_img(): image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") return image +def prepare_text(): + text = "a cat." 
+ return text + @require_timm @require_vision @slow class GroundingDINOModelIntegrationTests(unittest.TestCase): @cached_property - def default_image_processor(self): - return AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") if is_vision_available() else None + def default_processor(self): + return AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") if is_vision_available() else None def test_inference_object_detection_head(self): - model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr").to(torch_device) + model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").to(torch_device) - image_processor = self.default_image_processor + processor = self.default_processor image = prepare_img() - encoding = image_processor(images=image, return_tensors="pt").to(torch_device) - pixel_values = encoding["pixel_values"].to(torch_device) - pixel_mask = encoding["pixel_mask"].to(torch_device) + text = prepare_text() + encoding = processor(images=image, text=text, return_tensors="pt").to(torch_device) with torch.no_grad(): - outputs = model(pixel_values, pixel_mask) + outputs = model(**encoding) - expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.d_model)) self.assertEqual(outputs.logits.shape, expected_shape_logits) expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]).to(torch_device) @@ -605,50 +671,47 @@ def test_inference_object_detection_head(self): [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ).to(torch_device) - self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-3)) expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) # verify postprocessing - results = image_processor.post_process_object_detection( + results = processor.image_processor.post_process_object_detection( outputs, threshold=0.35, target_sizes=[image.size[::-1]] )[0] expected_scores = torch.tensor([0.4526, 0.4082]).to(torch_device) - expected_labels = [17, 17, 75, 75, 63] - expected_slice_boxes = torch.tensor([491.1074, 198.5045, 292.5861, 350.6499]).to(torch_device) + expected_slice_boxes = torch.tensor([344.8143, 23.1796, 637.4004, 373.8295]).to(torch_device) self.assertEqual(len(results["scores"]), 2) - self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4)) - self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3)) self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) @require_torch_gpu def test_inference_object_detection_head_equivalence_cpu_gpu(self): - image_processor = self.default_image_processor + processor = self.default_processor image = prepare_img() - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - pixel_mask = encoding["pixel_mask"] + text = prepare_text() + encoding = processor(images=image, text=text, return_tensors="pt") # 1. 
run model on CPU - model = GroundingDINOForObjectDetection.from_pretrained("SenseTime/deformable-detr-single-scale") + model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") with torch.no_grad(): - cpu_outputs = model(pixel_values, pixel_mask) + cpu_outputs = model(**encoding) # 2. run model on GPU model.to("cuda") - + encoding = {key: value.to("cuda") for key, value in encoding.items()} with torch.no_grad(): - gpu_outputs = model(pixel_values.to("cuda"), pixel_mask.to("cuda")) + gpu_outputs = model(**encoding) # 3. assert equivalence for key in cpu_outputs.keys(): assert torch.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-4) expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ) - assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-4) + assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-3) From 50c5f67da35945ae11ebc276fe0862de839b88f9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 21 Oct 2023 23:47:59 -0300 Subject: [PATCH 095/252] Improved tests inference --- tests/models/grounding_dino/test_modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index d06637a3a36ad5..4bea3e4f4bc817 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -686,7 +686,7 @@ def test_inference_object_detection_head(self): self.assertEqual(len(results["scores"]), 2) self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3)) - self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes)) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2)) @require_torch_gpu def test_inference_object_detection_head_equivalence_cpu_gpu(self): From d2922e1a25e687aec4a249f3ae8a52d77a31efdf Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 23 Oct 2023 00:25:34 -0300 Subject: [PATCH 096/252] More improvements --- .../models/grounding_dino/modeling_grounding_dino.py | 11 ++++++++--- .../grounding_dino/test_modeling_grounding_dino.py | 4 ---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9e657f168d3638..f918eebd1457d4 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1673,6 +1673,8 @@ def forward( enc_outputs = [ vision_features, text_features, + encoder_vision_states, + encoder_text_states, all_attns ] return tuple(v for v in enc_outputs if v is not None) @@ -2501,8 +2503,11 @@ def forward( return_dict=return_dict, ) + # index for encoder_last_hidden_state_text + idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0) + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] - enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[7] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx] init_reference = outputs.init_reference_points if return_dict else outputs[0] 
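        # Working through the arithmetic above, assuming the tuple layout used when return_dict=False:
        # the text encoder's last hidden state is expected at tuple index 5 when neither output_attentions nor
        # output_hidden_states is set, at index 6 when exactly one of them is set, and at index 7 when both are.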
inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] @@ -2561,8 +2566,8 @@ def forward( auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) outputs_loss["auxiliary_outputs"] = auxiliary_outputs if self.config.two_stage: - enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid() - outputs_loss["enc_outputs"] = {"logits": outputs.enc_outputs_class, "pred_boxes": enc_outputs_coord} + enc_outputs_coord = outputs[-1].sigmoid() + outputs_loss["enc_outputs"] = {"logits": outputs[-2], "pred_boxes": enc_outputs_coord} loss_dict = criterion(outputs_loss, labels) # Fourth: compute total loss, as a weighted sum of the various losses diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 4bea3e4f4bc817..69b9dbef5c05b4 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -538,9 +538,6 @@ def test_retain_grad_hidden_states_attentions(self): encoder_hidden_states.retain_grad() encoder_attentions.retain_grad() - decoder_attentions = outputs.decoder_attentions[0][0] - decoder_attentions.retain_grad() - cross_attentions = outputs.decoder_attentions[-1][0] cross_attentions.retain_grad() @@ -548,7 +545,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(encoder_hidden_states.grad) self.assertIsNotNone(encoder_attentions.grad) - self.assertIsNotNone(decoder_attentions.grad) self.assertIsNotNone(cross_attentions.grad) def test_forward_signature(self): From 891c34dd362f77d7162611cfa8a66a7d00199535 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 26 Oct 2023 00:08:07 -0300 Subject: [PATCH 097/252] More test improvements --- .../configuration_grounding_dino.py | 8 ++++++ .../grounding_dino/modeling_grounding_dino.py | 27 ++++++++++++------- .../test_modeling_grounding_dino.py | 2 ++ 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 869028e3cc2514..5d74a970cfa2e2 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -75,6 +75,8 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. Examples: @@ -108,6 +110,7 @@ def __init__( pad_token_id=0, position_embedding_type="absolute", use_cache=True, + init_std=0.02, **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -125,6 +128,7 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache + self.init_std = init_std @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -253,6 +257,8 @@ class GroundingDINOConfig(PretrainedConfig): generation. positional_embedding_temperature (`float`, *optional*, defaults to 20): The temperature for Sine Positional Embedding that is used together with vision backbone. 
+ init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. Examples: ```python @@ -319,6 +325,7 @@ def __init__( decoder_bbox_embed_share=True, two_stage_bbox_embed_share=False, positional_embedding_temperature=20, + init_std=0.02, **kwargs, ): if backbone_config is not None and use_timm_backbone: @@ -394,6 +401,7 @@ def __init__( if two_stage_bbox_embed_share and not decoder_bbox_embed_share: raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") self.positional_embedding_temperature = positional_embedding_temperature + self.init_std = init_std super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index f918eebd1457d4..6b23c42eaf644b 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1427,6 +1427,8 @@ class GroundingDINOPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, GroundingDINOLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) @@ -1437,21 +1439,26 @@ def _init_weights(self, module): elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): for p in module.parameters(): if p.dim() > 1: - nn.init.xavier_uniform_(p) - elif isinstance(module, GroundingDINOModel): - nn.init.constant_(module.text_projection.bias.data, 0) - nn.init.xavier_uniform_(module.text_projection.weight.data) - for proj in module.input_proj_vision: - nn.init.xavier_uniform_(proj[0].weight, gain=1) - nn.init.constant_(proj[0].bias, 0) + nn.init.normal_(p, mean=0.0, std=std) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, GroundingDINOMLPPredictionHead): + nn.init.constant_(module.layers[-1].weight.data, 0) + nn.init.constant_(module.layers[-1].bias.data, 0) + if hasattr(module, "reference_points") and not self.config.two_stage: nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) nn.init.constant_(module.reference_points.bias.data, 0.0) if hasattr(module, "level_embed"): nn.init.normal_(module.level_embed) - if isinstance(module, GroundingDINOMLPPredictionHead): - nn.init.constant_(module.layers[-1].weight.data, 0) - nn.init.constant_(module.layers[-1].bias.data, 0) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, GroundingDINODecoder): diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 69b9dbef5c05b4..54faebb8227265 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -600,6 +600,8 @@ def test_initialization(self): if ( "level_embed" in 
name or "sampling_offsets.bias" in name + or "text_param" in name + or "vision_param" in name or "value_proj" in name or "output_proj" in name or "reference_points" in name From eccaec95d0df167c098bdfe8e9c2ae86aa6c7637 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 26 Oct 2023 17:29:51 -0300 Subject: [PATCH 098/252] Fixed last test --- .../test_modeling_grounding_dino.py | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 54faebb8227265..0f12a2545c6879 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -14,9 +14,10 @@ # limitations under the License. """ Testing suite for the PyTorch Grounding DINO model. """ - +import collections import inspect import math +import re import unittest from typing import Dict, List, Tuple @@ -41,6 +42,7 @@ import torch from transformers import GroundingDINOForObjectDetection, GroundingDINOModel + from transformers.pytorch_utils import id_tensor_storage if is_vision_available(): @@ -628,6 +630,41 @@ def test_two_stage_training(self): loss = model(**inputs).loss loss.backward() + def test_tied_weights_keys(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + config.tie_word_embeddings = True + for model_class in self.all_model_classes: + model_tied = model_class(config) + + ptrs = collections.defaultdict(list) + for name, tensor in model_tied.state_dict().items(): + ptrs[id_tensor_storage(tensor)].append(name) + + # These are all the pointers of shared tensors. + tied_params = [names for _, names in ptrs.items() if len(names) > 1] + + tied_weight_keys = model_tied._tied_weights_keys if model_tied._tied_weights_keys is not None else [] + # Detect we get a hit for each key + for key in tied_weight_keys: + if not any(re.search(key, p) for group in tied_params for p in group): + raise ValueError(f"{key} is not a tied weight key for {model_class}.") + + # Removed tied weights found from tied params -> there should only be one left after + for key in tied_weight_keys: + for i in range(len(tied_params)): + tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None] + + # GroundingDINO when sharing weights also uses the shared ones in GroundingDINODecoder + # Therefore, differently from DeformableDetr, we expect the group lens to be 2 + # one for self.bbox_embed in GroundingDINOForObejectDetection and another one + # in the decoder + tied_params = [group for group in tied_params if len(group) > 2] + self.assertListEqual( + tied_params, + [], + f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.", + ) + TOLERANCE = 1e-4 From f32be01ec8795d84aeabac923131acae71f8c652 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 26 Oct 2023 19:13:49 -0300 Subject: [PATCH 099/252] Improved docstrings and comments --- .../grounding_dino/modeling_grounding_dino.py | 126 ++++++++---------- 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 6b23c42eaf644b..b299712746df18 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -339,10 +339,6 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): 
encoder_hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - # encoder_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - # encoder_attentions_text: Optional[Tuple[torch.FloatTensor]] = None - # encoder_cross_attentions_vision: Optional[Tuple[torch.FloatTensor]] = None - # encoder_cross_attentions_text: Optional[Tuple[torch.FloatTensor]] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None @@ -770,7 +766,6 @@ def forward( return output, attention_weights -# TODO is this an approriate way to name this? class GroundingDINOTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" @@ -1296,27 +1291,6 @@ def forward( self_attn_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): - """ - Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(seq_len, batch, embed_dim)`. - position_embeddings (`torch.FloatTensor`, *optional*): - Position embeddings that are added to the queries and keys in the self-attention layer. - reference_points (`torch.FloatTensor`, *optional*): - Reference points. - spatial_shapes (`torch.LongTensor`, *optional*): - Spatial shapes. - level_start_index (`torch.LongTensor`, *optional*): - Level start index. - encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size - `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative - values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ residual = hidden_states # Self Attention @@ -1486,7 +1460,7 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`] + Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDINOImageProcessor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): @@ -1497,18 +1471,31 @@ def _set_gradient_checkpointing(self, module, value=False): [What are attention masks?](../glossary#attention-mask) - decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*): - Not used by default. Can be used to mask object queries. + input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDINOTokenizer.__call__`] for details. + + attention_mask (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are real (i.e. **not masked**), + - 0 for tokens that are padding (i.e. **masked**). 
+ + [What are attention masks?](../glossary#attention-mask) + + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token + + [What are token type IDs?](../glossary#token-type-ids) + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) - `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + Tuple consists of (`last_hidden_state_vision`, *optional*: `last_hidden_state_text`, *optional*: + `hidden_states_vision`, *optional*: `hidden_states_text`, *optional*: `attentions`) + `last_hidden_state_vision` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you - can choose to directly pass a flattened representation of an image. - decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): - Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an - embedded representation. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1594,8 +1581,8 @@ def forward( Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: - - 1 for pixel features that are real (i.e. **not masked**), - - 0 for pixel features that are padding (i.e. **masked**). + - 0 for pixel features that are real (i.e. **not masked**), + - 1 for pixel features that are padding (i.e. **masked**). [What are attention masks?](../glossary#attention-mask) vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Position embeddings that are added to the queries and keys in each self-attention layer. @@ -1609,8 +1596,8 @@ def forward( Flattened text features that are passed to the encoder. text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: - - 1 for text features that are real (i.e. **not masked**), - - 0 for text features that are padding (i.e. **masked**). + - 0 for text features that are real (i.e. **not masked**), + - 1 for text features that are padding (i.e. **masked**). [What are attention masks?](../glossary#attention-mask) text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`): Position embeddings that are added to the queries and keys in each self-attention layer. @@ -1700,7 +1687,7 @@ class GroundingDINODecoder(GroundingDINOPreTrainedModel): The decoder updates the query embeddings through multiple self-attention and cross-attention layers. 
- Some tweaks for Deformable DETR: + Some tweaks for Grounding DINO: - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. - it also returns a stack of intermediate outputs and reference points from all decoding layers. @@ -1785,14 +1772,18 @@ def forward( Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): The query embeddings that are passed into the decoder. - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - of the decoder. - encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected - in `[0, 1]`: - - 1 for pixels that are real (i.e. **not masked**), - - 0 for pixels that are padding (i.e. **masked**). + vision_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Last hidden state from encoder related to vision feature map. + vision_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + text_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Last hidden state from encoder related to text features. + text_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 0 for text features that are real (i.e. **not masked**), + - 1 for text features that are padding (i.e. **masked**). reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): @@ -1801,7 +1792,10 @@ def forward( Indexes for the start of each feature level. In range `[0, sequence_length]`. valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): Ratio of valid area in each feature level. - + self_attn_mask (`torch.BoolTensor` of shape `(batch_size, text_seq_len)`): + Masks to avoid performing self-attention between vision hidden state. Mask values selected in `[0, 1]`: + - 1 for queries that are real (i.e. **not masked**), + - 0 for queries that are padding (i.e. **masked**). output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
@@ -2045,8 +2039,6 @@ def __init__(self, config: GroundingDINOConfig): self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) - print("Two stage:", config.two_stage) - if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) @@ -2175,23 +2167,23 @@ def forward( Examples: ```python - >>> from transformers import AutoImageProcessor, GroundingDINOModel + >>> from transformers import AutoProcessor, GroundingDINOModel >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." - >>> image_processor = AutoImageProcessor.from_pretrained("idea-research/grounding-dino-tiny") - >>> model = GroundingDINOModel.from_pretrained("idea-research/grounding-dino-tiny") - - >>> inputs = image_processor(images=image, return_tensors="pt") + >>> processor = AutoProcessor.from_pretrained("idea-research/grounding-dino-tiny") + >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") + >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state >>> list(last_hidden_states.shape) - [1, 300, 256] + [1, 900, 256] ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -2464,33 +2456,33 @@ def forward( Examples: ```python - >>> from transformers import AutoImageProcessor, GroundingDINOForObjectDetection + >>> from transformers import AutoProcessor, GroundingDINOForObjectDetection >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." - >>> image_processor = AutoImageProcessor.from_pretrained("idea-research/grounding-dino-tiny") + >>> processor = AutoProcessor.from_pretrained("idea-research/grounding-dino-tiny") >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") - >>> inputs = image_processor(images=image, return_tensors="pt") + >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) >>> # convert outputs (bounding boxes and class logits) to COCO API >>> target_sizes = torch.tensor([image.size[::-1]]) - >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + >>> results = processor.image_processor.post_process_object_detection(outputs, threshold=0.35, target_sizes=target_sizes)[ ... 0 ... ] >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): ... box = [round(i, 2) for i in box.tolist()] ... print( - ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"Detected {label.item()} with confidence " ... f"{round(score.item(), 3)} at location {box}" ... 
) - Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] - Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] - Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83] + Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89] ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict From 1c657e2adca10f5d753d307ab1bfbd0ac4cffdbe Mon Sep 17 00:00:00 2001 From: Niels Date: Fri, 27 Oct 2023 20:55:24 +0200 Subject: [PATCH 100/252] Fix style --- .../models/grounding_dino/__init__.py | 4 +- .../convert_grounding_dino_to_hf.py | 3 +- .../image_processing_grounding_dino.py | 12 +-- .../grounding_dino/modeling_grounding_dino.py | 91 +++++++++---------- .../utils/dummy_vision_objects.py | 7 ++ .../test_modeling_grounding_dino.py | 28 +++--- 6 files changed, 77 insertions(+), 68 deletions(-) diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 8002244b4287cd..83622a84513843 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -23,8 +23,8 @@ "GroundingDINOConfig", "GroundingDINOTextPrenetConfig", ], + "image_processing_grounding_dino": ["GroundingDINOImageProcessor"], "processing_grounding_dino": ["GroundingDINOProcessor"], - "image_processing_grounding_dino": ["GroundingDINOImageProcessor"] } try: @@ -47,8 +47,8 @@ GroundingDINOConfig, GroundingDINOTextPrenetConfig, ) - from .processing_grounding_dino import GroundingDINOProcessor from .image_processing_grounding_dino import GroundingDINOImageProcessor + from .processing_grounding_dino import GroundingDINOProcessor try: if not is_torch_available(): diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index ce48e78e219e8a..8883e64814d33b 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -25,11 +25,10 @@ from transformers import ( AutoTokenizer, - DeformableDetrImageProcessor, GroundingDINOConfig, GroundingDINOForObjectDetection, + GroundingDINOImageProcessor, GroundingDINOProcessor, - GroundingDINOImageProcessor ) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 44c7a8dabc3f1b..6c9d86f5a026b5 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -763,7 +763,7 @@ class GroundingDINOImageProcessor(BaseImageProcessor): Constructs a Grounding DINO image processor. Args: - format (`str`, *optional*, defaults to `"coco_detection"`): + format (`str`, *optional*, defaults to `AnnotionFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". do_resize (`bool`, *optional*, defaults to `True`): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be @@ -771,7 +771,7 @@ class GroundingDINOImageProcessor(BaseImageProcessor): size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): Size of the image's (height, width) dimensions after resizing. 
Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the @@ -779,9 +779,9 @@ class GroundingDINOImageProcessor(BaseImageProcessor): rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. - do_normalize: Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): Mean values to use when normalizing the image. Can be a single value or a list of values, one for each channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. @@ -1349,11 +1349,11 @@ def post_process_object_detection( self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None ): """ - Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, - bottom_right_x, bottom_right_y) format. + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Args: - outputs ([`OwlViTObjectDetectionOutput`]): + outputs ([`GroundingDINOObjectDetectionOutput`]): Raw outputs of the model. threshold (`float`, *optional*): Score threshold to keep object detection predictions. diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index b299712746df18..8634bcecc536bd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -153,9 +153,9 @@ class GroundingDINODecoderOutput(ModelOutput): shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the self-attention, cross-attention and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention, cross-attention and multi-scale deformable attention heads. """ last_hidden_state: torch.FloatTensor = None @@ -186,9 +186,10 @@ class GroundingDINOEncoderOutput(ModelOutput): of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. 
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and + multi-scale deformable attention heads. """ last_hidden_state_vision: torch.FloatTensor = None @@ -217,9 +218,9 @@ class GroundingDINOModelOutput(ModelOutput): shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the self-attention, cross-attention and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention, cross-attention and multi-scale deformable attention heads. encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -233,14 +234,15 @@ class GroundingDINOModelOutput(ModelOutput): of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. - attention softmax, used to compute the weighted average in the bi-attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and + multi-scale deformable attention heads. attention softmax, used to compute the weighted average in the + bi-attention heads. 
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): - Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are - picked as region proposals in the first stage. Output of bounding box binary classification (i.e. - foreground and background). + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as + region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and + background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. """ @@ -290,9 +292,9 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the self-attention, cross-attention and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention, cross-attention and multi-scale deformable attention heads. encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder of the model. encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -306,9 +308,10 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in - the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and + multi-scale deformable attention heads. intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): Stacked intermediate hidden states (output of each layer of the decoder). 
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): @@ -316,9 +319,9 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Initial reference points sent through the Transformer decoder. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): - Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are - picked as region proposals in the first stage. Output of bounding box binary classification (i.e. - foreground and background). + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as + region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and + background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. """ @@ -1427,7 +1430,7 @@ def _init_weights(self, module): elif isinstance(module, GroundingDINOMLPPredictionHead): nn.init.constant_(module.layers[-1].weight.data, 0) nn.init.constant_(module.layers[-1].bias.data, 0) - + if hasattr(module, "reference_points") and not self.config.two_stage: nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) nn.init.constant_(module.reference_points.bias.data, 0.0) @@ -1460,8 +1463,8 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDINOImageProcessor.__call__`] - for details. + Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDINOImageProcessor.__call__`] for + details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: @@ -1492,10 +1495,11 @@ def _set_gradient_checkpointing(self, module, value=False): [What are token type IDs?](../glossary#token-type-ids) encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of (`last_hidden_state_vision`, *optional*: `last_hidden_state_text`, *optional*: + Tuple consists of (`last_hidden_state_vision`, *optional*: `last_hidden_state_text`, *optional*: `hidden_states_vision`, *optional*: `hidden_states_text`, *optional*: `attentions`) - `last_hidden_state_vision` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of - hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + `last_hidden_state_vision` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence + of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the + decoder. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
@@ -1664,13 +1668,7 @@ def forward( all_attns = (all_attn_fused_vision, all_attn_fused_text, all_attn_enhanced_text, all_attn_deformable) if not return_dict: - enc_outputs = [ - vision_features, - text_features, - encoder_vision_states, - encoder_text_states, - all_attns - ] + enc_outputs = [vision_features, text_features, encoder_vision_states, encoder_text_states, all_attns] return tuple(v for v in enc_outputs if v is not None) return GroundingDINOEncoderOutput( last_hidden_state_vision=vision_features, @@ -2042,7 +2040,11 @@ def __init__(self, config: GroundingDINOConfig): if config.two_stage: self.enc_output = nn.Linear(config.d_model, config.d_model) self.enc_output_norm = nn.LayerNorm(config.d_model) - if config.two_stage_bbox_embed_share and config.decoder_bbox_embed_share and self.decoder.bbox_embed is not None: + if ( + config.two_stage_bbox_embed_share + and config.decoder_bbox_embed_share + and self.decoder.bbox_embed is not None + ): self.encoder_output_bbox_embed = self.decoder.bbox_embed else: self.encoder_output_bbox_embed = GroundingDINOMLPPredictionHead( @@ -2472,15 +2474,12 @@ def forward( >>> # convert outputs (bounding boxes and class logits) to COCO API >>> target_sizes = torch.tensor([image.size[::-1]]) - >>> results = processor.image_processor.post_process_object_detection(outputs, threshold=0.35, target_sizes=target_sizes)[ - ... 0 - ... ] + >>> results = processor.image_processor.post_process_object_detection( + ... outputs, threshold=0.35, target_sizes=target_sizes + ... )[0] >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): ... box = [round(i, 2) for i in box.tolist()] - ... print( - ... f"Detected {label.item()} with confidence " - ... f"{round(score.item(), 3)} at location {box}" - ... ) + ... 
print(f"Detected {label.item()} with confidence " f"{round(score.item(), 3)} at location {box}") Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83] Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89] ```""" diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index c0c39b57d096bc..27425117909d3a 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -233,6 +233,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class GroundingDINOImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class IdeficsImageProcessor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 0f12a2545c6879..220f1a6231ec9c 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -21,7 +21,13 @@ import unittest from typing import Dict, List, Tuple -from transformers import GroundingDINOConfig, SwinConfig, is_torch_available, is_vision_available, GroundingDINOTextPrenetConfig +from transformers import ( + GroundingDINOConfig, + GroundingDINOTextPrenetConfig, + SwinConfig, + is_torch_available, + is_vision_available, +) from transformers.file_utils import cached_property from transformers.testing_utils import ( require_timm, @@ -48,7 +54,7 @@ if is_vision_available(): from PIL import Image - from transformers import AutoImageProcessor, AutoProcessor + from transformers import AutoProcessor class GroundingDINOModelTester: @@ -141,11 +147,7 @@ def get_config(self): out_indices=[2, 3, 4], ) text_backbone = GroundingDINOTextPrenetConfig( - hidden_size=8, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=8, - max_position_embeddings=8 + hidden_size=8, num_hidden_layers=2, num_attention_heads=2, intermediate_size=8, max_position_embeddings=8 ) return GroundingDINOConfig( d_model=self.hidden_size, @@ -165,7 +167,7 @@ def get_config(self): use_timm_backbone=False, backbone_config=swin_config, max_text_len=self.max_text_len, - text_backbone_config=text_backbone + text_backbone_config=text_backbone, ) def prepare_config_and_inputs_for_common(self): @@ -465,7 +467,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = outputs.encoder_hidden_states_vision + hidden_states = outputs.encoder_hidden_states_vision expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 @@ -515,7 +517,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - def test_retain_grad_hidden_states_attentions(self): # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad @@ -658,7 +659,7 @@ def test_tied_weights_keys(self): # Therefore, differently from DeformableDetr, we expect the group lens to be 2 # one for self.bbox_embed in GroundingDINOForObejectDetection and another one # in the decoder - tied_params = [group for group in tied_params if len(group) > 2] + tied_params = [group for group in tied_params if len(group) > 2] self.assertListEqual( 
tied_params, [], @@ -674,6 +675,7 @@ def prepare_img(): image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") return image + def prepare_text(): text = "a cat." return text @@ -701,7 +703,9 @@ def test_inference_object_detection_head(self): expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.d_model)) self.assertEqual(outputs.logits.shape, expected_shape_logits) - expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]).to(torch_device) + expected_boxes = torch.tensor( + [[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]] + ).to(torch_device) expected_logits = torch.tensor( [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] ).to(torch_device) From 1202ce8ee1217e71386539cb4c7a38bcfc08eb06 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 10:59:46 -0300 Subject: [PATCH 101/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8634bcecc536bd..87a13d5aa08a39 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -840,7 +840,7 @@ def __init__(self, config): if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + f"`embed_dim` must be divisible by `num_heads` (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
) self.scale = self.head_dim ** (-0.5) self.dropout = dropout From d62dd114afb55b06e7e727cf82fb09dd10f24561 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:00:06 -0300 Subject: [PATCH 102/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 87a13d5aa08a39..b1c198d0fcb829 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -855,8 +855,8 @@ def __init__(self, config): self._reset_parameters() - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def _reset_parameters(self): nn.init.xavier_uniform_(self.vision_proj.weight) From bbf873b3817850fe85658a2e25b06688aab7ef71 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:00:35 -0300 Subject: [PATCH 103/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index b1c198d0fcb829..1fe4357eb02f0a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1945,7 +1945,7 @@ def custom_forward(*inputs): def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: """Generate attention mask between each pair of special tokens and positional ids. Args: - input_ids (torch.LongTensor): input ids. Shape: [bs, num_token] + input_ids (`torch.LongTensor`): input ids. 
Shape: [batch_size, num_token] Returns: Tuple[torch.Tensor]: attention mask between each special tokens and position_ids """ From c69b8a2da070083c528d311bd301d53b4714b78d Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:00:46 -0300 Subject: [PATCH 104/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 1fe4357eb02f0a..77f558cb6cbc6a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1947,7 +1947,7 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen Args: input_ids (`torch.LongTensor`): input ids. Shape: [batch_size, num_token] Returns: - Tuple[torch.Tensor]: attention mask between each special tokens and position_ids + `Tuple[torch.Tensor]`: attention mask between each special tokens and position_ids """ bs, num_token = input_ids.shape # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens From 274752c2fda16270fca712168e3e4c64d686ea79 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:01:14 -0300 Subject: [PATCH 105/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 77f558cb6cbc6a..8ca460031f6e11 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2103,9 +2103,9 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) """Generate the encoder output proposals from encoded enc_output. Args: - enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. - padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. - spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps. + enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder. + padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`. + spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps. Returns: `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. 
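
A note on the text-mask helper touched in the surrounding patches: `generate_masks_with_special_tokens_and_transfer_map` restricts text self-attention so that each phrase in the prompt, delimited by special tokens such as `[CLS]`, `[SEP]` and `"."`, only attends to itself, and it also returns position ids that restart inside every phrase. The snippet below is a minimal standalone sketch of that masking idea only, not the function's actual implementation; the helper name and the token ids are made up for illustration.

```python
import torch

# Hypothetical ids standing in for the model's SPECIAL_TOKENS ([CLS], [SEP], ".")
SPECIAL_TOKEN_IDS = (101, 102, 1012)


def block_diagonal_text_mask(input_ids: torch.LongTensor) -> torch.BoolTensor:
    """Allow attention only inside each phrase delimited by a special token."""
    batch_size, num_token = input_ids.shape
    mask = torch.eye(num_token, dtype=torch.bool).unsqueeze(0).repeat(batch_size, 1, 1)
    for batch_idx in range(batch_size):
        previous = 0
        for col in range(num_token):
            if input_ids[batch_idx, col].item() in SPECIAL_TOKEN_IDS:
                # the phrase spans from the token after the previous delimiter up to this one
                mask[batch_idx, previous : col + 1, previous : col + 1] = True
                previous = col + 1
    return mask


# "a cat. a dog." -> two phrases, each forming its own attention block
dummy_ids = torch.tensor([[101, 1037, 4937, 1012, 1037, 3899, 1012, 102]])
print(block_diagonal_text_mask(dummy_ids).int())
```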
From 91373e0964f47fcb14968da22a8b1b0d83fb36ac Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 11:06:35 -0300 Subject: [PATCH 106/252] Better naming --- .../models/grounding_dino/modeling_grounding_dino.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 8ca460031f6e11..e696b2137bb644 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1949,9 +1949,9 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen Returns: `Tuple[torch.Tensor]`: attention mask between each special tokens and position_ids """ - bs, num_token = input_ids.shape - # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens - special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + batch_size, num_token = input_ids.shape + # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool() for special_token in SPECIAL_TOKENS: special_tokens_mask |= input_ids == special_token @@ -1959,8 +1959,8 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen idxs = torch.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) - position_ids = torch.zeros((bs, num_token), device=input_ids.device) + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1) + position_ids = torch.zeros((batch_size, num_token), device=input_ids.device) previous_col = 0 for i in range(idxs.shape[0]): row, col = idxs[i] From 49458838fb3695173586e04932f54a65d5fa7202 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 11:21:57 -0300 Subject: [PATCH 107/252] Better naming --- .../models/grounding_dino/modeling_grounding_dino.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index e696b2137bb644..36ae85b86ac9de 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2116,9 +2116,9 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) """ batch_size = enc_output.shape[0] proposals = [] - _cur = 0 + current_position = 0 for level, (height, width) in enumerate(spatial_shapes): - mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view(batch_size, height, width, 1) valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) @@ -2134,7 +2134,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) proposals.append(proposal) - _cur += height * width + current_position += height * width output_proposals = torch.cat(proposals, 1) output_proposals_valid = 
((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) From 5882f5fcb8cd7ac29c0ffc448824faa041940f9b Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 14:58:06 -0300 Subject: [PATCH 108/252] Added Copied statement --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 36ae85b86ac9de..10338735006b50 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -67,7 +67,7 @@ if is_vision_available(): from transformers.image_transforms import center_to_corners_format - +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction class MultiScaleDeformableAttentionFunction(Function): @staticmethod def forward( From c96a1a1fcb6a7e6510274a708f05165a7fcd49c3 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 15:06:37 -0300 Subject: [PATCH 109/252] Added Copied statement --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 10338735006b50..5e9d775c9417c6 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -595,7 +595,7 @@ def build_position_encoding(config): return position_embedding - +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention def multi_scale_deformable_attention( value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor ) -> Tensor: From 558ad8776f7a610c925098e3f85a30bf6c44e38f Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 15:13:18 -0300 Subject: [PATCH 110/252] Moved param init from GroundingDINOBiMultiHeadAttention --- .../grounding_dino/modeling_grounding_dino.py | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 5e9d775c9417c6..4a5a13d4a74d4b 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -853,25 +853,9 @@ def __init__(self, config): self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) - self._reset_parameters() - def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - def _reset_parameters(self): - nn.init.xavier_uniform_(self.vision_proj.weight) - self.vision_proj.bias.data.fill_(0) - nn.init.xavier_uniform_(self.text_proj.weight) - self.text_proj.bias.data.fill_(0) - nn.init.xavier_uniform_(self.values_vision_proj.weight) - self.values_vision_proj.bias.data.fill_(0) - nn.init.xavier_uniform_(self.values_text_proj.weight) - self.values_text_proj.bias.data.fill_(0) - nn.init.xavier_uniform_(self.out_vision_proj.weight) - self.out_vision_proj.bias.data.fill_(0) - 
nn.init.xavier_uniform_(self.out_text_proj.weight) - self.out_text_proj.bias.data.fill_(0) - def forward( self, vision_features: Tensor, @@ -1412,7 +1396,18 @@ def _init_weights(self, module): elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): module._reset_parameters() elif isinstance(module, GroundingDINOBiMultiHeadAttention): - module._reset_parameters() + nn.init.xavier_uniform_(module.vision_proj.weight) + module.vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.text_proj.weight) + module.text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_vision_proj.weight) + module.values_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_text_proj.weight) + module.values_text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_vision_proj.weight) + module.out_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_text_proj.weight) + module.out_text_proj.bias.data.fill_(0) elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): for p in module.parameters(): if p.dim() > 1: From 5c32bdc9540176840abe93fee2d338d550e1f2a1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 15:28:44 -0300 Subject: [PATCH 111/252] Better naming --- .../models/grounding_dino/modeling_grounding_dino.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 4a5a13d4a74d4b..38683d76ba62ea 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -804,9 +804,9 @@ def forward( # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) - q = k = self.with_pos_embed(hidden_states, position_embeddings) + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) attention_output, attention_weights = self.self_attn( - query=q, key=k, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False + query=queries, key=keys, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output From c561087e944d3366b01a35ba5744fb465943be77 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Wed, 1 Nov 2023 15:32:52 -0300 Subject: [PATCH 112/252] Fixing clamp style --- .../grounding_dino/modeling_grounding_dino.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 38683d76ba62ea..6d49851104b482 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -904,23 +904,18 @@ def forward( ) attn_weights = attn_weights - attn_weights.max() - - attn_weights = torch.clamp( - attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range + # Do not increase -50000/50000, data type half has quite limited range attn_weights = torch.clamp( - attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + attn_weights, min=-50000, max=50000 + ) attn_weights_T = attn_weights.transpose(1, 2) text_attn_weights = attn_weights_T - torch.max(attn_weights_T, 
dim=-1, keepdim=True)[0] + # Do not increase -50000/50000, data type half has quite limited range text_attn_weights = torch.clamp( - text_attn_weights, min=-50000 - ) # Do not increase -50000, data type half has quite limited range - text_attn_weights = torch.clamp( - text_attn_weights, max=50000 - ) # Do not increase 50000, data type half has quite limited range + text_attn_weights, min=-50000, max=50000 + ) # mask vison for language if vision_attention_mask is not None: From 07d4c62dc4adefcb2a27a39634392df824a8e272 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:08:34 -0300 Subject: [PATCH 113/252] Better naming --- .../models/grounding_dino/modeling_grounding_dino.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 6d49851104b482..16972223f94046 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -909,8 +909,8 @@ def forward( attn_weights, min=-50000, max=50000 ) - attn_weights_T = attn_weights.transpose(1, 2) - text_attn_weights = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] + attn_weights_transposed = attn_weights.transpose(1, 2) + text_attn_weights = attn_weights_transposed - torch.max(attn_weights_transposed, dim=-1, keepdim=True)[0] # Do not increase -50000/50000, data type half has quite limited range text_attn_weights = torch.clamp( From ba37183c4812a2afc678a0ef417f4d4aeb0d35ce Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:10:34 -0300 Subject: [PATCH 114/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 16972223f94046..02bca0495571e0 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -917,7 +917,7 @@ def forward( text_attn_weights, min=-50000, max=50000 ) - # mask vison for language + # mask vision for language if vision_attention_mask is not None: vision_attention_mask = ( vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) From c746e1d96c9e18c113cd3a1748dc5e3741234c1d Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:14:27 -0300 Subject: [PATCH 115/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 02bca0495571e0..2091a95e88b708 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1156,13 +1156,10 @@ def get_text_position_embeddings( ) -> Tensor: batch_size, seq_length, _ = 
text_features.shape if text_position_embedding is None and text_position_ids is None: - text_position_embedding = ( - torch.arange(seq_length, device=text_features.device) - .float() - .unsqueeze(0) - .unsqueeze(-1) - .repeat(batch_size, 1, 1) - ) + text_position_embedding = torch.arange(seq_length, device=text_features.device) + text_position_embedding = text_position_embedding.float() + text_position_embedding = text_position_embedding.unsqueeze(0).unsqueeze(-1) + text_position_embedding = text_position_embedding.repeat(batch_size, 1, 1) text_position_embedding = get_sine_pos_embed( text_position_embedding, num_pos_feats=self.d_model, exchange_xy=False ) From 07b260dd58471d19723510c1fa08c1f786c797b3 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:33:37 -0300 Subject: [PATCH 116/252] Update src/transformers/models/grounding_dino/configuration_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/configuration_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 5d74a970cfa2e2..1d9ba9a25e7082 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -385,7 +385,7 @@ def __init__( self.text_backbone_config = text_backbone_config else: raise ValueError( - f"`text_backbone_config` should be either a `dict` or a `GroundingDINOTextPrenetConfig` instance instead got {type(text_backbone_config)}" + f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDINOTextPrenetConfig`. Received {type(text_backbone_config)} instead." 
) self.max_text_len = max_text_len # Text Enhancer From 898e0727c73ee79184b8567b1bc2af87a56bc1ba Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:36:18 -0300 Subject: [PATCH 117/252] Update src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 8883e64814d33b..fade922f8e5370 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -100,7 +100,7 @@ def create_rename_keys(state_dict, config): f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermidiate + # intermediate rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", From 34b36a3a68c682b085d2314e99910e190f4cd167 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:37:38 -0300 Subject: [PATCH 118/252] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2091a95e88b708..9e4aaeaa8f5b9d 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -281,7 +281,7 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). You can use [`~GroundingDINOProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. - auxiliary_outputs (`list[Dict]`, *optional*): + auxiliary_outputs (`List[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and `pred_boxes`) for each decoder layer. 
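
One further note on the `torch.clamp(attn_weights, min=-50000, max=50000)` calls consolidated in the bi-attention a few patches above: the in-code comment says half precision "has quite limited range", and indeed float16 cannot represent values above 65504, so unclamped fused-attention logits can overflow to `inf` before the softmax. A quick self-contained check of that failure mode (the values are illustrative only):

```python
import torch

# float16 overflows past ~65504; clamping the logits first keeps them finite
logits = torch.tensor([70000.0, 1.0])
print(torch.isinf(logits.half()).any())                       # tensor(True)  -> overflow
print(torch.isinf(logits.clamp(-50000, 50000).half()).any())  # tensor(False) -> stays finite
```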
From e14d6aea74f9857cfb583c2a05c7eeb7785b3e0d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:39:42 -0300 Subject: [PATCH 119/252] Improving conversion script --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index fade922f8e5370..0737b060a6e379 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -94,13 +94,11 @@ def create_rename_keys(state_dict, config): # attention rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - # rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_index", - # f"encoder.layers.{layer}.blocks.{block}.attention.relative_position_index")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermediate + # intermidiate rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", @@ -238,10 +236,6 @@ def create_rename_keys(state_dict, config): target_prefix_decoder + target_name)) ########################################## DECODER - END - #TODO convert head - ########################################## HEAD - START - ########################################## HEAD - END - ########################################## Additional - START for layer_name, params in state_dict.items(): #### TEXT BACKBONE From f867e5081eb72a13ece9f50e0a6c200dfe5426d0 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:44:59 -0300 Subject: [PATCH 120/252] Improved config --- .../configuration_grounding_dino.py | 17 +++++------------ .../grounding_dino/modeling_grounding_dino.py | 4 ++-- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 1d9ba9a25e7082..b7fee34a46b262 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -220,9 +220,6 @@ class GroundingDINOConfig(PretrainedConfig): two_stage (`bool`, *optional*, defaults to `True`): Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. - with_box_refine (`bool`, *optional*, defaults to `True`): - Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes - based on the predictions from the previous layer. 
class_cost (`float`, *optional*, defaults to 1): Relative weight of the classification error in the Hungarian matching cost. bbox_cost (`float`, *optional*, defaults to 5): @@ -307,12 +304,11 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - with_box_refine=True, - class_cost=1, - bbox_cost=5, - giou_cost=2, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, + class_cost=1., + bbox_cost=5., + giou_cost=2., + bbox_loss_coefficient=5., + giou_loss_coefficient=2., focal_alpha=0.25, disable_custom_kernels=False, # other parameters @@ -364,9 +360,6 @@ def __init__( self.encoder_n_points = encoder_n_points self.decoder_n_points = decoder_n_points self.two_stage = two_stage - self.with_box_refine = with_box_refine - if two_stage is True and with_box_refine is False: - raise ValueError("If two_stage is True, with_box_refine must be True.") # Hungarian matcher self.class_cost = class_cost self.bbox_cost = bbox_cost diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9e4aaeaa8f5b9d..c095c0c1911e21 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -318,11 +318,11 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): Stacked intermediate reference points (reference points of each layer of the decoder). init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Initial reference points sent through the Transformer decoder. - enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). - enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. 
""" From fc105bee2456ce58272090c3072d4199649bf7b1 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:47:43 -0300 Subject: [PATCH 121/252] Improved naming --- .../grounding_dino/modeling_grounding_dino.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index c095c0c1911e21..fc85e4f212cceb 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -874,21 +874,21 @@ def forward( Returns: _type_: _description_ """ - bsz, tgt_len, _ = vision_features.size() + batch_size, tgt_len, _ = vision_features.size() vision_query_states = self.vision_proj(vision_features) * self.scale - vision_query_states = self._shape(vision_query_states, tgt_len, bsz) + vision_query_states = self._shape(vision_query_states, tgt_len, batch_size) text_key_states = self.text_proj(text_features) - text_key_states = self._shape(text_key_states, -1, bsz) + text_key_states = self._shape(text_key_states, -1, batch_size) vision_value_states = self.values_vision_proj(vision_features) - vision_value_states = self._shape(vision_value_states, -1, bsz) + vision_value_states = self._shape(vision_value_states, -1, batch_size) text_value_states = self.values_text_proj(text_features) - text_value_states = self._shape(text_value_states, -1, bsz) + text_value_states = self._shape(text_value_states, -1, batch_size) - proj_shape = (bsz * self.num_heads, -1, self.head_dim) + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) vision_query_states = vision_query_states.view(*proj_shape) text_key_states = text_key_states.view(*proj_shape) @@ -898,9 +898,9 @@ def forward( src_len = text_key_states.size(1) attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + if attn_weights.size() != (batch_size * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + f"Attention weights should be of size {(batch_size * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" ) attn_weights = attn_weights - attn_weights.max() @@ -938,23 +938,23 @@ def forward( vision_attn_output = torch.bmm(vision_attn_probs, text_value_states) text_attn_output = torch.bmm(text_attn_probs, vision_value_states) - if vision_attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + if vision_attn_output.size() != (batch_size * self.num_heads, tgt_len, self.head_dim): raise ValueError( - f"`vision_attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" + f"`vision_attn_output` should be of size {(batch_size, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" ) - if text_attn_output.size() != (bsz * self.num_heads, src_len, self.head_dim): + if text_attn_output.size() != (batch_size * self.num_heads, src_len, self.head_dim): raise ValueError( - f"`text_attn_output` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" + f"`text_attn_output` should be of size {(batch_size, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" ) - vision_attn_output = vision_attn_output.view(bsz, 
self.num_heads, tgt_len, self.head_dim) + vision_attn_output = vision_attn_output.view(batch_size, self.num_heads, tgt_len, self.head_dim) vision_attn_output = vision_attn_output.transpose(1, 2) - vision_attn_output = vision_attn_output.reshape(bsz, tgt_len, self.embed_dim) + vision_attn_output = vision_attn_output.reshape(batch_size, tgt_len, self.embed_dim) - text_attn_output = text_attn_output.view(bsz, self.num_heads, src_len, self.head_dim) + text_attn_output = text_attn_output.view(batch_size, self.num_heads, src_len, self.head_dim) text_attn_output = text_attn_output.transpose(1, 2) - text_attn_output = text_attn_output.reshape(bsz, src_len, self.embed_dim) + text_attn_output = text_attn_output.reshape(batch_size, src_len, self.embed_dim) vision_attn_output = self.out_vision_proj(vision_attn_output) text_attn_output = self.out_text_proj(text_attn_output) From ed1176ef3eb656ef97a15bd3f104bdbb39347b1a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 11:52:27 -0300 Subject: [PATCH 122/252] Improved naming again --- .../models/grounding_dino/modeling_grounding_dino.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index fc85e4f212cceb..96f89505bfc1af 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -346,8 +346,8 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): enc_outputs_coord_logits: Optional[torch.FloatTensor] = None -def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) +def _get_clones(module, num_copies): + return nn.ModuleList([copy.deepcopy(module) for i in range(num_copies)]) def inverse_sigmoid(x, eps=1e-5): From ef5c90fd3453f50b180df56ac7f867fff0840890 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 16:26:05 -0300 Subject: [PATCH 123/252] Improved grouding-dino.md --- docs/source/en/model_doc/grounding-dino.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index 03c3549c32cb5f..ef41448d3d06ef 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -26,11 +26,23 @@ The abstract from the paper is the following: Tips: - +- One can use [`GroundingDINOProcessor`] to prepare image-text pairs for the model. + + + + Grounding DINO overview. Taken from the original paper. This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/IDEA-Research/GroundingDINO). 
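The tip added above ("One can use [`GroundingDINOProcessor`] to prepare image-text pairs for the model") can be made concrete with a short zero-shot detection sketch. Everything below assumes this PR's in-progress API: the checkpoint name is taken from the pipeline test added later in this series, the thresholds are illustrative, the processor is assumed to accept `images` and `text` keywords and return every tensor the model forward expects, and `post_process_grounded_object_detection` is only introduced in PATCH 133 further down:

```python
import requests
import torch
from PIL import Image

from transformers import GroundingDINOForObjectDetection, GroundingDINOProcessor

model_id = "EduardoPacheco/grounding-dino-tiny"  # checkpoint used in the pipeline test of this series
processor = GroundingDINOProcessor.from_pretrained(model_id)
model = GroundingDINOForObjectDetection.from_pretrained(model_id)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "a cat."  # text queries are lower-cased phrases ending with a dot, as in the tests

inputs = processor(images=image, text=text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.35,  # illustrative values, not mandated by the PR
    text_threshold=0.25,
    target_sizes=[image.size[::-1]],  # (height, width) per image
)
```

The returned `results` follow the image processor's `post_process_object_detection` format, with the `labels` key replaced by decoded phrases once PATCH 133 lands.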
+ +## GroundingDINOImageProcessor + +[[autodoc]] GroundingDINOImageProcessor + - preprocess + - post_process_object_detection + ## GroundingDINOProcessor [[autodoc]] GroundingDINOProcessor From b2fd8687e33e84f12042b4f377c438e47840f14c Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 2 Nov 2023 16:30:37 -0300 Subject: [PATCH 124/252] Moved grounding dino to multimodal --- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b80f2f093699a5..0ec9808abe4ca8 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -523,8 +523,6 @@ title: FocalNet - local: model_doc/glpn title: GLPN - - local: model_doc/grounding-dino - title: Grounding DINO - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit @@ -669,6 +667,8 @@ title: FLAVA - local: model_doc/git title: GIT + - local: model_doc/grounding-dino + title: Grounding DINO - local: model_doc/groupvit title: GroupViT - local: model_doc/idefics From c23497ccba3186fb181f57800a3e89c1013355b0 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 3 Nov 2023 10:34:26 -0300 Subject: [PATCH 125/252] Update src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 0737b060a6e379..4a2c5eb5e21e7d 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -98,7 +98,7 @@ def create_rename_keys(state_dict, config): f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermidiate + # intermediate rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", From a729a389db5932a68b38afccb31b28dcdfc96203 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 3 Nov 2023 11:12:01 -0300 Subject: [PATCH 126/252] Fixed docstrings and style --- .../configuration_grounding_dino.py | 10 +- .../image_processing_grounding_dino.py | 5 +- .../grounding_dino/modeling_grounding_dino.py | 122 ++++++++++++++---- 3 files changed, 104 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index b7fee34a46b262..6ca2114bd3f560 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -304,11 +304,11 @@ def __init__( encoder_n_points=4, decoder_n_points=4, two_stage=True, - class_cost=1., - bbox_cost=5., - giou_cost=2., - bbox_loss_coefficient=5., - giou_loss_coefficient=2., + class_cost=1.0, + bbox_cost=5.0, + giou_cost=2.0, + bbox_loss_coefficient=5.0, 
+ giou_loss_coefficient=2.0, focal_alpha=0.25, disable_custom_kernels=False, # other parameters diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 6c9d86f5a026b5..d45ec72dd3fb79 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -778,9 +778,8 @@ class GroundingDINOImageProcessor(BaseImageProcessor): `do_rescale` parameter in the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the - `preprocess` method. + `preprocess` method. Controls whether to normalize the image. Can be overridden by the `do_normalize` + parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): Mean values to use when normalizing the image. Can be a single value or a list of values, one for each diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 96f89505bfc1af..a7cfad6db54e0e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -67,6 +67,7 @@ if is_vision_available(): from transformers.image_transforms import center_to_corners_format + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction class MultiScaleDeformableAttentionFunction(Function): @staticmethod @@ -595,6 +596,7 @@ def build_position_encoding(config): return position_embedding + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention def multi_scale_deformable_attention( value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor @@ -796,10 +798,31 @@ def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Ten def forward( self, - hidden_states: Tensor, - attention_masks: Optional[Tensor] = None, - position_embeddings: Optional[Tensor] = None, - ): # repeat attn mask + hidden_states: torch.FloatTensor, + attention_masks: Optional[torch.BoolTensor] = None, + position_embeddings: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + """Text self-attention to enhance projection of text features generated by + the text encoder (GroundingDINOTextPrenet) within GroundingDINOEncoderLayer + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`): + Text features generated by the text encoder. + attention_masks (`torch.BoolTensor`, *optional*): + Attention mask for text self-attention. False for real tokens and True for padding tokens. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings to be added to the hidden states. + + Returns: + `tuple(torch.FloatTensor)` comprising two elements: + - **hidden_states** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) -- + Output of the text self-attention layer. 
+ - **attention_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, + sequence_length)`) -- + Attention weights of the text self-attention layer. + """ + + # repeat attn mask if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: # bs, num_q, num_k attention_masks = attention_masks.repeat(self.num_heads, 1, 1) @@ -858,21 +881,37 @@ def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): def forward( self, - vision_features: Tensor, - text_features: Tensor, - vision_attention_mask: Optional[Tensor] = None, - text_attention_mask: Optional[Tensor] = None, - ): - """_summary_ + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + vision_attention_mask: Optional[torch.BoolTensor] = None, + text_attention_mask: Optional[torch.BoolTensor] = None, + ) -> Tuple[Tuple[torch.FloatTensor, torch.FloatTensor], Tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image-to-text and text-to-image cross-attention Args: - vision_features Tensor: bs, n_img, dim - text_features Tensor: bs, n_text, dim - vision_attention_mask (Tensor, optional): _description_. bs, n_img - text_attention_mask (Tensor, optional): _description_. bs, n_text + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + vision_attention_mask (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. + text_attention_mask (`torch.BoolTensor`, **optional**): + Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens. Returns: - _type_: _description_ + `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an attention + output and weights: + - **vision_attn_output** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_din)`) + -- + Output of the image-to-text cross-attention layer. + - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, vision_sequence_length, + vision_sequence_length)`) -- + Attention weights of the image-to-text cross-attention layer. + - **text_attn_output** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`) -- + Output of the text-to-image cross-attention layer. + - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, text_sequence_length, + text_sequence_length)`) -- + Attention weights of the text-to-image cross-attention layer. 
""" batch_size, tgt_len, _ = vision_features.size() @@ -905,17 +944,13 @@ def forward( attn_weights = attn_weights - attn_weights.max() # Do not increase -50000/50000, data type half has quite limited range - attn_weights = torch.clamp( - attn_weights, min=-50000, max=50000 - ) + attn_weights = torch.clamp(attn_weights, min=-50000, max=50000) attn_weights_transposed = attn_weights.transpose(1, 2) text_attn_weights = attn_weights_transposed - torch.max(attn_weights_transposed, dim=-1, keepdim=True)[0] # Do not increase -50000/50000, data type half has quite limited range - text_attn_weights = torch.clamp( - text_attn_weights, min=-50000, max=50000 - ) + text_attn_weights = torch.clamp(text_attn_weights, min=-50000, max=50000) # mask vision for language if vision_attention_mask is not None: @@ -1013,7 +1048,39 @@ def __init__(self, config, init_values=1e-4): self.vision_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) self.text_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) - def forward(self, vision_features, text_features, attention_mask_vision=None, attention_mask_text=None): + def forward( + self, + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + attention_mask_vision: Optional[torch.BoolTensor] = None, + attention_mask_text: Optional[torch.BoolTensor] = None, + ) -> Tuple[Tuple[torch.FloatTensor, torch.FloatTensor], Tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image and text features fusion + + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + attention_mask_vision (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. + attention_mask_text (`torch.BoolTensor`, **optional**): + Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens. + + Returns: + `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an enhanced + feature and attention output and weights: + - **vision_features** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, vision_dim)`) -- + Updated vision features with attention output from image-to-text cross-attention layer. + - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, vision_sequence_length, + vision_sequence_length)`) -- + Attention weights of the image-to-text cross-attention layer. + - **text_features** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, text_dim)`) -- + Updated text features with attention output from text-to-image cross-attention layer. + - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, text_sequence_length, + text_sequence_length)`) -- + Attention weights of the text-to-image cross-attention layer. + """ vision_features = self.layer_norm_vision(vision_features) text_features = self.layer_norm_text(text_features) (delta_v, vision_attn), (delta_t, text_attn) = self.attn( @@ -1932,9 +1999,12 @@ def custom_forward(*inputs): def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: """Generate attention mask between each pair of special tokens and positional ids. 
Args: - input_ids (`torch.LongTensor`): input ids. Shape: [batch_size, num_token] + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Returns: - `Tuple[torch.Tensor]`: attention mask between each special tokens and position_ids + `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids: + - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`) + - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`) """ batch_size, num_token = input_ids.shape # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens @@ -2105,7 +2175,9 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) proposals = [] current_position = 0 for level, (height, width) in enumerate(spatial_shapes): - mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view(batch_size, height, width, 1) + mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view( + batch_size, height, width, 1 + ) valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) From aafcc34eb9550eda7e0126b07083b10e318127e4 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 12:04:24 +0100 Subject: [PATCH 127/252] Fix docstrings --- .../grounding_dino/configuration_grounding_dino.py | 10 +++++----- .../grounding_dino/image_processing_grounding_dino.py | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 6ca2114bd3f560..16da12a7eaf676 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -220,15 +220,15 @@ class GroundingDINOConfig(PretrainedConfig): two_stage (`bool`, *optional*, defaults to `True`): Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. - class_cost (`float`, *optional*, defaults to 1): + class_cost (`float`, *optional*, defaults to 1.0): Relative weight of the classification error in the Hungarian matching cost. - bbox_cost (`float`, *optional*, defaults to 5): + bbox_cost (`float`, *optional*, defaults to 5.0): Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. - giou_cost (`float`, *optional*, defaults to 2): + giou_cost (`float`, *optional*, defaults to 2.0): Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. - bbox_loss_coefficient (`float`, *optional*, defaults to 5): + bbox_loss_coefficient (`float`, *optional*, defaults to 5.0): Relative weight of the L1 bounding box loss in the object detection loss. - giou_loss_coefficient (`float`, *optional*, defaults to 2): + giou_loss_coefficient (`float`, *optional*, defaults to 2.0): Relative weight of the generalized IoU loss in the object detection loss. focal_alpha (`float`, *optional*, defaults to 0.25): Alpha parameter in the focal loss. 
diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index d45ec72dd3fb79..f415e5e1f4a57b 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -780,7 +780,9 @@ class GroundingDINOImageProcessor(BaseImageProcessor): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. - do_normalize (`bool`, *optional*, defaults to `True`): + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): Mean values to use when normalizing the image. Can be a single value or a list of values, one for each channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. From e4bad9b93e14b1726766d4cd8ac6ca70dfef0f30 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 13:41:40 +0100 Subject: [PATCH 128/252] Remove timm attributes --- .../configuration_grounding_dino.py | 49 ++++--------------- .../convert_grounding_dino_to_hf.py | 19 ++++--- .../grounding_dino/modeling_grounding_dino.py | 42 +++------------- 3 files changed, 28 insertions(+), 82 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 16da12a7eaf676..8bf480e7d99705 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -160,16 +160,10 @@ class GroundingDINOConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - use_timm_backbone (`bool`, *optional*, defaults to `False`): - Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] - API. - backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `{'model_type': 'swin'}`): - The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which - case it will default to `ResNetConfig()`. + backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): + The configuration of the backbone model. text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): The configuration of the text backbone model. Should be a bert-like config. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`GroundingDINOModel`] can detect in a single image. @@ -202,15 +196,6 @@ class GroundingDINOConfig(PretrainedConfig): Whether auxiliary decoding losses (loss at each decoder layer) are to be used. position_embedding_type (`str`, *optional*, defaults to `"sine"`): Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. - backbone (`str`, *optional*, defaults to `"swin"`): - Name of convolutional backbone to use in case `use_timm_backbone` = `True`. 
Supports any convolutional - backbone from the timm package. For a list of all available models, see [this - page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). - use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. - dilation (`bool`, *optional*, defaults to `False`): - Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when - `use_timm_backbone` = `True`. num_feature_levels (`int`, *optional*, defaults to 4): The number of input feature levels. encoder_n_points (`int`, *optional*, defaults to 4): @@ -278,10 +263,8 @@ class GroundingDINOConfig(PretrainedConfig): def __init__( self, - use_timm_backbone=False, - backbone_config={"model_type": "swin"}, + backbone_config=None, text_backbone_config=None, - num_channels=3, num_queries=900, encoder_layers=6, encoder_ffn_dim=2048, @@ -297,9 +280,6 @@ def __init__( activation_dropout=0.0, auxiliary_loss=False, position_embedding_type="sine", - backbone="swin", - use_pretrained_backbone=True, - dilation=False, num_feature_levels=4, encoder_n_points=4, decoder_n_points=4, @@ -324,20 +304,14 @@ def __init__( init_std=0.02, **kwargs, ): - if backbone_config is not None and use_timm_backbone: - raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - - if not use_timm_backbone: - if backbone_config is None: - logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") - backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) - elif isinstance(backbone_config, dict): - backbone_model_type = backbone_config.get("model_type") - config_class = CONFIG_MAPPING[backbone_model_type] - backbone_config = config_class.from_dict(backbone_config) - self.use_timm_backbone = use_timm_backbone + if backbone_config is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) self.backbone_config = backbone_config - self.num_channels = num_channels self.num_queries = num_queries self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim @@ -352,9 +326,6 @@ def __init__( self.activation_function = activation_function self.auxiliary_loss = auxiliary_loss self.position_embedding_type = position_embedding_type - self.backbone = backbone - self.use_pretrained_backbone = use_pretrained_backbone - self.dilation = dilation # deformable attributes self.num_feature_levels = num_feature_levels self.encoder_n_points = encoder_n_points diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 4a2c5eb5e21e7d..883540be9c8a03 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -29,6 +29,7 @@ GroundingDINOForObjectDetection, GroundingDINOImageProcessor, GroundingDINOProcessor, + SwinConfig, ) @@ -37,8 +38,6 @@ def get_grounding_dino_config(model_name): - config = GroundingDINOConfig() - if "tiny" in model_name: window_size = 7 embed_dim = 96 @@ -54,12 +53,16 @@ def get_grounding_dino_config(model_name): else: raise ValueError("Model not supported, only supports base and large variants") - config.backbone_config.window_size = window_size - config.backbone_config.image_size = image_size - config.backbone_config.embed_dim = embed_dim - config.backbone_config.depths = depths - config.backbone_config.num_heads = num_heads - config.backbone_config.out_indices = [2, 3, 4] + backbone_config = SwinConfig( + window_size=window_size, + image_size=image_size, + embed_dim=embed_dim, + depths=depths, + num_heads=num_heads, + out_indices=[2, 3, 4], + ) + + config = GroundingDINOConfig(backbone_config=backbone_config) return config diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index a7cfad6db54e0e..5abd1a8685b809 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -33,7 +33,6 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_scipy_available, - is_timm_available, is_torch_cuda_available, is_vision_available, replace_return_docstrings, @@ -120,9 +119,6 @@ def backward(context, grad_output): if is_scipy_available(): from scipy.optimize import linear_sum_assignment -if is_timm_available(): - from timm import create_model - logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "GroundingDINOConfig" @@ -422,58 +418,34 @@ def replace_batch_norm(model): replace_batch_norm(module) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvEncoder with DeformableDetr->GroundingDINO class GroundingDINOConvEncoder(nn.Module): """ - Convolutional backbone, using either the AutoBackbone API or one from the timm library. + Convolutional backbone using the AutoBackbone API. nn.BatchNorm2d layers are replaced by GroundingDINOFrozenBatchNorm2d as defined above. 
- """ def __init__(self, config): super().__init__() self.config = config - - if config.use_timm_backbone: - requires_backends(self, ["timm"]) - kwargs = {} - if config.dilation: - kwargs["output_stride"] = 16 - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), - in_chans=config.num_channels, - **kwargs, - ) - else: - backbone = AutoBackbone.from_config(config.backbone_config) + backbone = AutoBackbone.from_config(config.backbone_config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = ( - self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels - ) + self.intermediate_channel_sizes = self.model.channels - backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type + backbone_model_type = config.backbone_config.model_type if "resnet" in backbone_model_type: for name, parameter in self.model.named_parameters(): - if config.use_timm_backbone: - if "layer2" not in name and "layer3" not in name and "layer4" not in name: - parameter.requires_grad_(False) - else: - if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: - parameter.requires_grad_(False) + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) - # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDINO def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + features = self.model(pixel_values).feature_maps out = [] for feature_map in features: From e48d4118905b4ae652b3a7c635a4d7b1ffc8f453 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 13:44:31 +0100 Subject: [PATCH 129/252] Reorder imports --- .../grounding_dino/modeling_grounding_dino.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 5abd1a8685b809..7a51183dca5ccc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -66,6 +66,19 @@ if is_vision_available(): from transformers.image_transforms import center_to_corners_format +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "GroundingDINOConfig" +_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" + +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "idea-research/grounding-dino-tiny", + # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino +] + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction class MultiScaleDeformableAttentionFunction(Function): @@ -116,20 +129,6 @@ def backward(context, grad_output): return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "GroundingDINOConfig" 
-_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" - -GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "idea-research/grounding-dino-tiny", - # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino -] - - @dataclass class GroundingDINODecoderOutput(ModelOutput): """ From a7f026f9af0a8dd8359ef23773c5e926945f8256 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 17:29:57 +0100 Subject: [PATCH 130/252] More improvements --- docs/source/en/tasks/object_detection.md | 2 +- src/transformers/models/auto/modeling_auto.py | 1 - .../models/grounding_dino/__init__.py | 20 ++++++++++++++++--- .../configuration_grounding_dino.py | 11 +++------- .../image_processing_grounding_dino.py | 18 ++++++++--------- utils/check_repo.py | 1 + 6 files changed, 31 insertions(+), 22 deletions(-) diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 58ec02e80cadf7..7511ee66dd0b99 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -37,7 +37,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Grounding DINO](../model_doc/grounding-dino), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 45669e3ad8b4ac..5084482515a597 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -632,7 +632,6 @@ ("deformable_detr", "DeformableDetrForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), - ("grounding-dino", "GroundingDINOForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 83622a84513843..67ffc2becc52c1 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available _import_structure = { @@ -23,7 +23,6 @@ "GroundingDINOConfig", "GroundingDINOTextPrenetConfig", ], - "image_processing_grounding_dino": ["GroundingDINOImageProcessor"], "processing_grounding_dino": ["GroundingDINOProcessor"], } @@ -40,6 +39,14 @@ "GroundingDINOPreTrainedModel", ] +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_grounding_dino"] = ["GroundingDINOImageProcessor"] + if TYPE_CHECKING: from .configuration_grounding_dino import ( @@ -47,7 +54,6 @@ GroundingDINOConfig, GroundingDINOTextPrenetConfig, ) - from .image_processing_grounding_dino import GroundingDINOImageProcessor from .processing_grounding_dino import GroundingDINOProcessor try: @@ -63,6 
+69,14 @@ GroundingDINOPreTrainedModel, ) + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_grounding_dino import GroundingDINOImageProcessor + else: import sys diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 8bf480e7d99705..50bffc63377d6f 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 SenseTime and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 IDEA Research and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -72,9 +72,6 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -109,7 +106,6 @@ def __init__( layer_norm_eps=1e-12, pad_token_id=0, position_embedding_type="absolute", - use_cache=True, init_std=0.02, **kwargs, ): @@ -127,7 +123,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type - self.use_cache = use_cache self.init_std = init_std @classmethod @@ -162,8 +157,8 @@ class GroundingDINOConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): The configuration of the backbone model. - text_backbone_config (`str`, *optional*, defaults to `"bert-base-uncased"`): - The configuration of the text backbone model. Should be a bert-like config. + text_backbone_config (`str`, *optional*, defaults to `GroundingDINOTextPrenetConfig()`): + The configuration of the text backbone model. Should be a BERT-like config. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`GroundingDINOModel`] can detect in a single image. diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index f415e5e1f4a57b..b1c92686fdde95 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -286,7 +286,7 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndar return masks -# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->DeformableDetr +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDINO def prepare_coco_detection_annotation( image, target, @@ -294,7 +294,7 @@ def prepare_coco_detection_annotation( input_data_format: Optional[Union[ChannelDimension, str]] = None, ): """ - Convert the target in COCO format into the format expected by DeformableDetr. + Convert the target in COCO format into the format expected by GroundingDINO. """ image_height, image_width = get_image_size(image, channel_dim=input_data_format) @@ -379,7 +379,7 @@ def masks_to_boxes(masks: np.ndarray) -> np.ndarray: return np.stack([x_min, y_min, x_max, y_max], 1) -# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DeformableDetr +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDINO def prepare_coco_panoptic_annotation( image: np.ndarray, target: Dict, @@ -388,7 +388,7 @@ def prepare_coco_panoptic_annotation( input_data_format: Union[ChannelDimension, str] = None, ) -> Dict: """ - Prepare a coco panoptic annotation for DeformableDetr. + Prepare a coco panoptic annotation for GroundingDINO. """ image_height, image_width = get_image_size(image, channel_dim=input_data_format) annotation_path = pathlib.Path(masks_path) / target["file_name"] @@ -839,11 +839,11 @@ def __init__( self.do_pad = do_pad @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->DeformableDetr + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `DeformableDetrImageProcessor.from_pretrained(checkpoint, size=600, + created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)` """ image_processor_dict = image_processor_dict.copy() @@ -853,7 +853,7 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DeformableDetr + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDINO def prepare_annotation( self, image: np.ndarray, @@ -864,7 +864,7 @@ def prepare_annotation( input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> Dict: """ - Prepare an annotation for feeding into DeformableDetr model. + Prepare an annotation for feeding into GroundingDINO model. 
""" format = format if format is not None else self.format diff --git a/utils/check_repo.py b/utils/check_repo.py index 95ab142fa0b7f9..cbd2a1781f9966 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -175,6 +175,7 @@ "CLIPSegTextModel", "EsmForProteinFolding", "GPTSanJapaneseModel", + "GroundingDINOForObjectDetection", "TimeSeriesTransformerForPrediction", "InformerForPrediction", "AutoformerForPrediction", From 1930b2ac1ffb57b2eedfa13bc3edf48028683d7a Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 19:32:49 +0100 Subject: [PATCH 131/252] Add Grounding DINO to pipeline --- src/transformers/models/auto/modeling_auto.py | 1 + .../configuration_grounding_dino.py | 1 + ...st_pipelines_zero_shot_object_detection.py | 25 +++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 5084482515a597..cec27eab921110 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -640,6 +640,7 @@ MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Zero Shot Object Detection mapping + ("grounding-dino", "GroundingDINOForObjectDetection"), ("owlv2", "Owlv2ForObjectDetection"), ("owlvit", "OwlViTForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 50bffc63377d6f..474dbb44012b8f 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -236,6 +236,7 @@ class GroundingDINOConfig(PretrainedConfig): The temperature for Sine Positional Embedding that is used together with vision backbone. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ Examples: ```python diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py index c8b424483fa20e..b03ef4285e3115 100644 --- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py @@ -227,3 +227,28 @@ def test_top_k(self): {"score": 0.277, "label": "remote", "box": {"xmin": 40, "ymin": 72, "xmax": 177, "ymax": 115}}, ], ) + + @require_torch + @slow + def test_grounding_dino(self): + object_detector = pipeline("zero-shot-object-detection", model="EduardoPacheco/grounding-dino-tiny") + + outputs = object_detector( + "http://images.cocodataset.org/val2017/000000039769.jpg", + candidate_labels=["a cat."], + ) + + self.assertEqual( + nested_simplify(outputs, decimals=4), + [ + {"score": 0.4526, "label": "a cat.", "box": {"xmin": 344, "ymin": 23, "xmax": 637, "ymax": 373}}, + {"score": 0.4082, "label": "a cat.", "box": {"xmin": 11, "ymin": 51, "xmax": 316, "ymax": 472}}, + {"score": 0.1617, "label": "a cat.", "box": {"xmin": 357, "ymin": 37, "xmax": 552, "ymax": 362}}, + {"score": 0.1299, "label": "a cat.", "box": {"xmin": 330, "ymin": 13, "xmax": 635, "ymax": 445}}, + {"score": 0.1279, "label": "a cat.", "box": {"xmin": 25, "ymin": 54, "xmax": 315, "ymax": 366}}, + {"score": 0.1267, "label": "a cat.", "box": {"xmin": 41, "ymin": 59, "xmax": 306, "ymax": 402}}, + {"score": 0.1098, "label": "a cat.", "box": {"xmin": 279, "ymin": 12, "xmax": 636, "ymax": 408}}, + {"score": 0.1063, "label": "a cat.", "box": {"xmin": 353, "ymin": 39, "xmax": 616, "ymax": 297}}, + {"score": 0.1043, "label": "a cat.", "box": {"xmin": 351, "ymin": 26, "xmax": 550, "ymax": 458}}, + ], + ) From 6ac265ca6b0a241c6f23fdbbeec084d4184f5686 Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 13 Nov 2023 19:53:32 +0100 Subject: [PATCH 132/252] Remove model from check_repo --- utils/check_repo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check_repo.py b/utils/check_repo.py index cbd2a1781f9966..95ab142fa0b7f9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -175,7 +175,6 @@ "CLIPSegTextModel", "EsmForProteinFolding", "GPTSanJapaneseModel", - "GroundingDINOForObjectDetection", "TimeSeriesTransformerForPrediction", "InformerForPrediction", "AutoformerForPrediction", From 93b8609ace2b0cc82a7c9ce67c3b5f01b42f7491 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 14 Nov 2023 02:26:43 -0300 Subject: [PATCH 133/252] Added grounded post_process to GroundingDINOProcessor --- .../processing_grounding_dino.py | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 131eb2c600a1cd..066ccbac897bf5 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,39 @@ Processor class for Grounding DINO. """ -from typing import List, Optional, Union +from typing import List, Optional, Union, Dict, Tuple + +import torch from ...image_utils import ImageInput from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType +def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTensor): + """Get token ids of phrases from posmaps and input_ids. 
+ + Args: + posmaps (`torch.BoolTensor` of shape `(num_boxes, hidden_size)`): + A boolean tensor of text-thresholded logits related to the detected bounding boxes. + input_ids (`torch.LongTensor`) of shape `(sequence_length, )`): + A tensor of token ids. + + Returns: + _type_: _description_ + """ + left_idx = 0 + right_idx = 255 + + posmaps[:, 0: left_idx + 1] = False + posmaps[:, right_idx:] = False + + token_ids = [] + for posmap in posmaps: + non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist() + token_ids.append([input_ids[i] for i in non_zero_idx]) + + return token_ids class GroundingDINOProcessor(ProcessorMixin): r""" @@ -149,3 +175,21 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + def post_process_grounded_object_detection(self, outputs, input_ids, box_threshold: float, text_threshold: float, target_sizes: Union[TensorType, List[Tuple]] = None): + """ + Post-process the output of the model to get the grounded object detection results. + """ + results = self.image_processor.post_process_object_detection(outputs, box_threshold, target_sizes) + + probs = torch.sigmoid(outputs.logits) # (batch_size, num_queries, 256) + + for idx, (result, prob) in enumerate(zip(results, probs)): + labels = result["labels"] + # Assuming that selected bboxes are sorted by confidence due to Hungarian matching loss in training + prob = prob[:len(labels)] # len(labels) , 256 + token_ids = get_phrases_from_posmap(prob > text_threshold, input_ids[idx]) + # overrides result labels key + result["labels"] = self.batch_decode(token_ids) + + return results \ No newline at end of file From 6461389afe4add98460f9581cb3d41bd5972d74a Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 14 Nov 2023 02:43:28 -0300 Subject: [PATCH 134/252] Fixed style --- .../processing_grounding_dino.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 066ccbac897bf5..695babc3034995 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,7 +16,7 @@ Processor class for Grounding DINO. """ -from typing import List, Optional, Union, Dict, Tuple +from typing import List, Optional, Tuple, Union import torch @@ -25,13 +25,14 @@ from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType + def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTensor): """Get token ids of phrases from posmaps and input_ids. Args: - posmaps (`torch.BoolTensor` of shape `(num_boxes, hidden_size)`): + posmaps (`torch.BoolTensor` of shape `(num_boxes, hidden_size)`): A boolean tensor of text-thresholded logits related to the detected bounding boxes. - input_ids (`torch.LongTensor`) of shape `(sequence_length, )`): + input_ids (`torch.LongTensor`) of shape `(sequence_length, )`): A tensor of token ids. 
Returns: @@ -40,7 +41,7 @@ def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTens left_idx = 0 right_idx = 255 - posmaps[:, 0: left_idx + 1] = False + posmaps[:, 0 : left_idx + 1] = False posmaps[:, right_idx:] = False token_ids = [] @@ -50,6 +51,7 @@ def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTens return token_ids + class GroundingDINOProcessor(ProcessorMixin): r""" Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a @@ -175,21 +177,28 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - def post_process_grounded_object_detection(self, outputs, input_ids, box_threshold: float, text_threshold: float, target_sizes: Union[TensorType, List[Tuple]] = None): + + def post_process_grounded_object_detection( + self, + outputs, + input_ids, + box_threshold: float, + text_threshold: float, + target_sizes: Union[TensorType, List[Tuple]] = None, + ): """ Post-process the output of the model to get the grounded object detection results. """ results = self.image_processor.post_process_object_detection(outputs, box_threshold, target_sizes) - probs = torch.sigmoid(outputs.logits) # (batch_size, num_queries, 256) + probs = torch.sigmoid(outputs.logits) # (batch_size, num_queries, 256) for idx, (result, prob) in enumerate(zip(results, probs)): labels = result["labels"] # Assuming that selected bboxes are sorted by confidence due to Hungarian matching loss in training - prob = prob[:len(labels)] # len(labels) , 256 + prob = prob[: len(labels)] # len(labels) , 256 token_ids = get_phrases_from_posmap(prob > text_threshold, input_ids[idx]) # overrides result labels key result["labels"] = self.batch_decode(token_ids) - return results \ No newline at end of file + return results From e35f1c97191b117e5fdf40ad293d4a07b4925e63 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Tue, 14 Nov 2023 09:52:35 -0300 Subject: [PATCH 135/252] Fixed GroundingDINOTextPrenetConfig docstrings --- .../grounding_dino/configuration_grounding_dino.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 474dbb44012b8f..efdb550e8374bc 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -78,13 +78,16 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): Examples: ```python - >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOTextPrenetModel + >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOConfig, GroundingDINOForObjectDetection >>> # Initializing a BERT bert-base-uncased style configuration >>> configuration = GroundingDINOTextPrenetConfig() - >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration - >>> model = GroundingDINOTextPrenetModel(configuration) + >>> # Initializing a GroundingDINOConfig with generated bert-like config + >>> config = GroundingDINOConfig(text_backbone_config=configuration) + + >>> # Initializing a model from the ground-up with a config + >>> model = GroundingDINOForObjectDetection(config) >>> # Accessing the model configuration >>> 
configuration = model.config From 695ffa5a00458975d8e99e08b84eb0adcf08a4ab Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 16 Nov 2023 14:23:46 -0300 Subject: [PATCH 136/252] Aligned inputs.keys() when both image and text are passed with model_input_names --- .../models/grounding_dino/processing_grounding_dino.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 695babc3034995..fa8a09b8e36c6e 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -151,7 +151,9 @@ def __call__( text_encoding = None if text_encoding is not None: - encoding_image_processor.update(text_encoding) + # Keeping same order of model_input_names when both images and text + text_encoding.update(encoding_image_processor) + encoding_image_processor = text_encoding return encoding_image_processor From 7d16d7f684acc95380f8a94703d6c81e055ebafc Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 16 Nov 2023 14:24:18 -0300 Subject: [PATCH 137/252] Added tests for GroundingDINOImageProcessor and GroundingDINOProcessor --- .../test_image_processing_grounding_dino.py | 202 +++++++++++++++++ .../test_processor_grounding_dino.py | 212 ++++++++++++++++++ 2 files changed, 414 insertions(+) create mode 100644 tests/models/grounding_dino/test_image_processing_grounding_dino.py create mode 100644 tests/models/grounding_dino/test_processor_grounding_dino.py diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py new file mode 100644 index 00000000000000..17bbc140de2fc3 --- /dev/null +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -0,0 +1,202 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import unittest + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import GroundingDINOImageProcessor + + +class GroundingDINOImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_rescale=True, + rescale_factor=1 / 255, + do_pad=True, + ): + # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p + size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_pad = do_pad + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_pad": self.do_pad, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to GroundingDINOImageProcessor, + assuming do_resize is set to True with a scalar size. 
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size["shortest_edge"] * h / w) + expected_width = self.size["shortest_edge"] + elif w > h: + expected_height = self.size["shortest_edge"] + expected_width = int(self.size["shortest_edge"] * w / h) + else: + expected_height = self.size["shortest_edge"] + expected_width = self.size["shortest_edge"] + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + return self.num_channels, height, width + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = GroundingDINOImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = GroundingDINOImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "size")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) + self.assertEqual(image_processor.do_pad, True) + + image_processor = self.image_processing_class.from_dict( + self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False + ) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) + self.assertEqual(image_processor.do_pad, False) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + image_processing = GroundingDINOImageProcessor() + encoding = image_processing(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = 
torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py new file mode 100644 index 00000000000000..9231cd8f167350 --- /dev/null +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -0,0 +1,212 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import shutil +import tempfile +import unittest + +import numpy as np +import pytest + +from transformers import BertTokenizer, BertTokenizerFast +from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES +from transformers.testing_utils import require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available + + +if is_vision_available(): + from PIL import Image + + from transformers import GroundingDINOImageProcessor, GroundingDINOProcessor + + +@require_vision +class GroundingDINOProcessorTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + image_processor_map = { + "do_resize": True, + "size": None, + "do_normalize": True, + "image_mean": [0.5, 0.5, 0.5], + "image_std": [0.5, 0.5, 0.5], + "do_rescale": True, + "rescale_factor": 1 / 255, + "do_pad": True, + } + self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME) + with open(self.image_processor_file, "w", encoding="utf-8") as fp: + json.dump(image_processor_map, fp) + + def get_tokenizer(self, **kwargs): + return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + def get_image_processor(self, **kwargs): + return GroundingDINOImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def prepare_image_inputs(self): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. 
+ """ + + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + return image_inputs + + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + image_processor = self.get_image_processor() + + processor_slow = GroundingDINOProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow.save_pretrained(self.tmpdirname) + processor_slow = GroundingDINOProcessor.from_pretrained(self.tmpdirname, use_fast=False) + + processor_fast = GroundingDINOProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast.save_pretrained(self.tmpdirname) + processor_fast = GroundingDINOProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) + self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) + + self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor_slow.image_processor, GroundingDINOImageProcessor) + self.assertIsInstance(processor_fast.image_processor, GroundingDINOImageProcessor) + + def test_save_load_pretrained_additional_features(self): + processor = GroundingDINOProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = GroundingDINOProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, BertTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, GroundingDINOImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_image_proc = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_image_proc.keys(): + self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = 
self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual( + list(inputs.keys()), ["pixel_values", "pixel_mask", "input_ids", "token_type_ids", "attention_mask"] + ) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), processor.model_input_names) From 98321e30286c35483d4f581a44ca228eadfb3bbb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Thu, 16 Nov 2023 14:25:36 -0300 Subject: [PATCH 138/252] Testing post_process_grounded_object_detection from GroundingDINOProcessor at test_inference_object_detection_head --- .../grounding_dino/test_modeling_grounding_dino.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 220f1a6231ec9c..cb2af2109cdaef 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -727,6 +727,20 @@ def test_inference_object_detection_head(self): self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3)) self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2)) + # verify grounded postprocessing + expected_labels = ["a cat", "a cat"] + results = processor.post_process_grounded_object_detection( + outputs=outputs, + input_ids=encoding.input_ids, + box_threshold=0.35, + text_threshold=0.3, + target_sizes=[image.size[::-1]], + )[0] + + self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3)) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2)) + self.assertListEqual(results["labels"], expected_labels) + @require_torch_gpu def test_inference_object_detection_head_equivalence_cpu_gpu(self): processor = self.default_processor From 3da62df955f2be581f76bef06920554717982ab8 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 17 Nov 2023 21:52:45 -0300 Subject: [PATCH 139/252] Fixed order --- tests/models/grounding_dino/test_processor_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 9231cd8f167350..cc7e5b9ba31c5d 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -178,7 +178,7 @@ def test_processor(self): inputs = 
processor(text=input_str, images=image_input) self.assertListEqual( - list(inputs.keys()), ["pixel_values", "pixel_mask", "input_ids", "token_type_ids", "attention_mask"] + list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"] ) # test if it raises when no input is passed From 6be9a6801a4bf7069e93e9d6562e2d87e208a3d9 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 18 Nov 2023 16:55:30 -0300 Subject: [PATCH 140/252] Marked test with require_torch --- .../grounding_dino/test_processor_grounding_dino.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index cc7e5b9ba31c5d..2d7ddfb95e395b 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -23,16 +23,19 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.testing_utils import require_vision, require_torch +from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available, is_torch_available +if is_torch_available(): + import torch + if is_vision_available(): from PIL import Image from transformers import GroundingDINOImageProcessor, GroundingDINOProcessor - +@require_torch @require_vision class GroundingDINOProcessorTest(unittest.TestCase): def setUp(self): From cc1ee6078223c9b78825d8820b76cdd4c436c0d7 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 18 Nov 2023 16:59:26 -0300 Subject: [PATCH 141/252] Temporarily changed repo_id --- .../grounding_dino/configuration_grounding_dino.py | 8 ++++---- .../models/grounding_dino/modeling_grounding_dino.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index efdb550e8374bc..96cf21765f23c7 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -24,7 +24,7 @@ logger = logging.get_logger(__name__) GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "idea-research/grounding-dino-tiny": "https://huggingface.co/idea-research/grg-dino-tiny/resolve/main/config.json", + "EduardoPacheco/grounding-dino-tiny": "https://huggingface.co/EduardoPacheco/grounding-dino-tiny/resolve/main/config.json", } @@ -152,7 +152,7 @@ class GroundingDINOConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO - [idea-research/grounding-dino-tiny](https://huggingface.co/idea-research/grounding-dino-tiny) architecture. + [EduardoPacheco/grounding-dino-tiny](https://huggingface.co/EduardoPacheco/grounding-dino-tiny) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
@@ -245,10 +245,10 @@ class GroundingDINOConfig(PretrainedConfig): ```python >>> from transformers import GroundingDINOConfig, GroundingDINOModel - >>> # Initializing a Grounding DINO idea-research/grounding-dino-tiny style configuration + >>> # Initializing a Grounding DINO EduardoPacheco/grounding-dino-tiny style configuration >>> configuration = GroundingDINOConfig() - >>> # Initializing a model (with random weights) from the idea-research/grounding-dino-tiny style configuration + >>> # Initializing a model (with random weights) from the EduardoPacheco/grounding-dino-tiny style configuration >>> model = GroundingDINOModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 7a51183dca5ccc..19a9e343933d90 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -72,10 +72,10 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "GroundingDINOConfig" -_CHECKPOINT_FOR_DOC = "idea-research/grounding-dino-tiny" +_CHECKPOINT_FOR_DOC = "EduardoPacheco/grounding-dino-tiny" GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "idea-research/grounding-dino-tiny", + "EduardoPacheco/grounding-dino-tiny", # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino ] @@ -2207,8 +2207,8 @@ def forward( >>> image = Image.open(requests.get(url, stream=True).raw) >>> text = "a cat." - >>> processor = AutoProcessor.from_pretrained("idea-research/grounding-dino-tiny") - >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") + >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) @@ -2496,8 +2496,8 @@ def forward( >>> image = Image.open(requests.get(url, stream=True).raw) >>> text = "a cat." - >>> processor = AutoProcessor.from_pretrained("idea-research/grounding-dino-tiny") - >>> model = GroundingDINOForObjectDetection.from_pretrained("idea-research/grounding-dino-tiny") + >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) From 8cf167eb85f7cac2ef6a3185867adf9f0078d091 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Sat, 18 Nov 2023 17:55:21 -0300 Subject: [PATCH 142/252] More improvements --- .../models/grounding_dino/processing_grounding_dino.py | 3 --- .../grounding_dino/test_processor_grounding_dino.py | 9 +++++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index fa8a09b8e36c6e..fbf619a271a768 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -34,9 +34,6 @@ def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTens A boolean tensor of text-thresholded logits related to the detected bounding boxes. 
input_ids (`torch.LongTensor`) of shape `(sequence_length, )`): A tensor of token ids. - - Returns: - _type_: _description_ """ left_idx = 0 right_idx = 255 diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 2d7ddfb95e395b..411110de90b849 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -23,17 +23,18 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision, require_torch -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available, is_torch_available +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available if is_torch_available(): - import torch + from transformers import GroundingDINOProcessor if is_vision_available(): from PIL import Image - from transformers import GroundingDINOImageProcessor, GroundingDINOProcessor + from transformers import GroundingDINOImageProcessor + @require_torch @require_vision From 2927c130a144868b66dc7538a479022293a8fefe Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 20 Nov 2023 14:39:08 +0100 Subject: [PATCH 143/252] Fix style --- .../configuration_grounding_dino.py | 2 + .../image_processing_grounding_dino.py | 12 +++--- .../grounding_dino/modeling_grounding_dino.py | 37 +++++-------------- .../processing_grounding_dino.py | 5 ++- 4 files changed, 20 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 96cf21765f23c7..0f49dc4be95a4d 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -92,6 +92,7 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + model_type = "grounding-dino-text-prenet" def __init__( @@ -254,6 +255,7 @@ class GroundingDINOConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + model_type = "grounding-dino" attribute_map = { "hidden_size": "d_model", diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index b1c92686fdde95..251289f7add757 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -135,9 +135,9 @@ def get_resize_output_image_size( image size is computed by keeping the aspect ratio of the input image size. Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or `List[int]`): The desired output size. max_size (`int`, *optional*): The maximum allowed output size. @@ -1350,8 +1350,8 @@ def post_process_object_detection( self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None ): """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. 
+ Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Args: outputs ([`GroundingDINOObjectDetectionOutput`]): @@ -1389,7 +1389,7 @@ def post_process_object_detection( else: img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) boxes = boxes * scale_fct[:, None, :] results = [] diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 19a9e343933d90..2aeff26ad8ecbf 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -406,10 +406,11 @@ def replace_batch_norm(model): if isinstance(module, nn.BatchNorm2d): new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) - new_module.weight.data.copy_(module.weight) - new_module.bias.data.copy_(module.bias) - new_module.running_mean.data.copy_(module.running_mean) - new_module.running_var.data.copy_(module.running_var) + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) model._modules[name] = new_module @@ -476,21 +477,6 @@ def forward(self, pixel_values, pixel_mask): return out, pos -# Copied from transformers.models.detr.modeling_detr._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): - """ - Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. - """ - batch_size, source_len = mask.size() - target_len = target_len if target_len is not None else source_len - - expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) - - class GroundingDINOSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you @@ -3516,20 +3502,15 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, past_key_value, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, hidden_states, attention_mask, layer_head_mask, encoder_hidden_states, encoder_attention_mask, + past_key_value, + output_attentions, ) else: layer_outputs = layer_module( diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index fbf619a271a768..00c1e864faf06a 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -64,6 +64,7 @@ class GroundingDINOProcessor(ProcessorMixin): tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 
""" + attributes = ["image_processor", "tokenizer"] image_processor_class = "GroundingDINOImageProcessor" tokenizer_class = "AutoTokenizer" @@ -165,8 +166,8 @@ def batch_decode(self, *args, **kwargs): # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer def decode(self, *args, **kwargs): """ - This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer - to the docstring of this method for more information. + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. """ return self.tokenizer.decode(*args, **kwargs) From 42ee6bcae87cffb4d05352def77dbde27434e92d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Mon, 20 Nov 2023 10:41:39 -0300 Subject: [PATCH 144/252] Final improvements --- .../configuration_grounding_dino.py | 2 + .../image_processing_grounding_dino.py | 12 ++--- .../grounding_dino/modeling_grounding_dino.py | 37 ++++--------- .../processing_grounding_dino.py | 5 +- .../test_image_processing_grounding_dino.py | 52 +++++++++++++++++++ .../test_processor_grounding_dino.py | 11 ++++ 6 files changed, 83 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 96cf21765f23c7..0f49dc4be95a4d 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -92,6 +92,7 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + model_type = "grounding-dino-text-prenet" def __init__( @@ -254,6 +255,7 @@ class GroundingDINOConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + model_type = "grounding-dino" attribute_map = { "hidden_size": "d_model", diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index b1c92686fdde95..251289f7add757 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -135,9 +135,9 @@ def get_resize_output_image_size( image size is computed by keeping the aspect ratio of the input image size. Args: - image_size (`Tuple[int, int]`): - The input image size. - size (`int`): + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or `List[int]`): The desired output size. max_size (`int`, *optional*): The maximum allowed output size. @@ -1350,8 +1350,8 @@ def post_process_object_detection( self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None ): """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. + Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. 
Args: outputs ([`GroundingDINOObjectDetectionOutput`]): @@ -1389,7 +1389,7 @@ def post_process_object_detection( else: img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) boxes = boxes * scale_fct[:, None, :] results = [] diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 19a9e343933d90..2aeff26ad8ecbf 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -406,10 +406,11 @@ def replace_batch_norm(model): if isinstance(module, nn.BatchNorm2d): new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) - new_module.weight.data.copy_(module.weight) - new_module.bias.data.copy_(module.bias) - new_module.running_mean.data.copy_(module.running_mean) - new_module.running_var.data.copy_(module.running_var) + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) model._modules[name] = new_module @@ -476,21 +477,6 @@ def forward(self, pixel_values, pixel_mask): return out, pos -# Copied from transformers.models.detr.modeling_detr._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None): - """ - Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`. - """ - batch_size, source_len = mask.size() - target_len = target_len if target_len is not None else source_len - - expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) - - class GroundingDINOSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you @@ -3516,20 +3502,15 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, past_key_value, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, hidden_states, attention_mask, layer_head_mask, encoder_hidden_states, encoder_attention_mask, + past_key_value, + output_attentions, ) else: layer_outputs = layer_module( diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index fbf619a271a768..00c1e864faf06a 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -64,6 +64,7 @@ class GroundingDINOProcessor(ProcessorMixin): tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 
""" + attributes = ["image_processor", "tokenizer"] image_processor_class = "GroundingDINOImageProcessor" tokenizer_class = "AutoTokenizer" @@ -165,8 +166,8 @@ def batch_decode(self, *args, **kwargs): # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer def decode(self, *args, **kwargs): """ - This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer - to the docstring of this method for more information. + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. """ return self.tokenizer.decode(*args, **kwargs) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 17bbc140de2fc3..3c24b9cedd4340 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -15,6 +15,7 @@ import json +import pathlib import unittest from transformers.testing_utils import require_torch, require_vision, slow @@ -32,6 +33,7 @@ from transformers import GroundingDINOImageProcessor +# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester with DeformableDetr->GroundingDINO class GroundingDINOImageProcessingTester(unittest.TestCase): def __init__( self, @@ -126,6 +128,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_torch @require_vision +# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDINO class GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = GroundingDINOImageProcessor if is_vision_available() else None @@ -200,3 +203,52 @@ def test_call_pytorch_with_coco_detection_annotations(self): # verify size expected_size = torch.tensor([800, 1066]) self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + image_processing = GroundingDINOImageProcessor(format="coco_panoptic") + encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + 
expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify masks + expected_masks_sum = 822873 + self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 411110de90b849..b48350e0099ea7 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -77,18 +77,23 @@ def setUp(self): with open(self.image_processor_file, "w", encoding="utf-8") as fp: json.dump(image_processor_map, fp) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_rust_tokenizer with CLIP->Bert def get_rust_tokenizer(self, **kwargs): return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_image_processor with CLIP->GroundingDINO def get_image_processor(self, **kwargs): return GroundingDINOImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.tearDown def tearDown(self): shutil.rmtree(self.tmpdirname) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.prepare_image_inputs def prepare_image_inputs(self): """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, or a list of PyTorch tensors if one specifies torchify=True. 
@@ -100,6 +105,7 @@ def prepare_image_inputs(self): return image_inputs + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_default with CLIP->GroundingDINO,GroundingDINOTokenizer->BertTokenizer def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() @@ -124,6 +130,7 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_slow.image_processor, GroundingDINOImageProcessor) self.assertIsInstance(processor_fast.image_processor, GroundingDINOImageProcessor) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDINO,GroundingDINOTokenizer->BertTokenizer def test_save_load_pretrained_additional_features(self): processor = GroundingDINOProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) processor.save_pretrained(self.tmpdirname) @@ -141,6 +148,7 @@ def test_save_load_pretrained_additional_features(self): self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) self.assertIsInstance(processor.image_processor, GroundingDINOImageProcessor) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDINO def test_image_processor(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() @@ -155,6 +163,7 @@ def test_image_processor(self): for key in input_image_proc.keys(): self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDINO def test_tokenizer(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() @@ -189,6 +198,7 @@ def test_processor(self): with pytest.raises(ValueError): processor() + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer_decode with CLIP->GroundingDINO def test_tokenizer_decode(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() @@ -202,6 +212,7 @@ def test_tokenizer_decode(self): self.assertListEqual(decoded_tok, decoded_processor) + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_model_input_names with CLIP->GroundingDINO def test_model_input_names(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() From e2b48b0ed8190971b1a3bd85d8f7eb97f87f2ea7 Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 23 Nov 2023 13:06:06 +0100 Subject: [PATCH 145/252] Improve annotators --- .../test_image_processing_grounding_dino.py | 2 +- .../grounding_dino/test_processor_grounding_dino.py | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 3c24b9cedd4340..cca1233e6d7bc6 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -126,7 +126,6 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F ) -@require_torch @require_vision # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDINO class 
GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): @@ -161,6 +160,7 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.do_pad, False) @slow + @require_torch def test_call_pytorch_with_coco_detection_annotations(self): # prepare image and target image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index b48350e0099ea7..7b658d8724dd68 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -21,22 +21,18 @@ import numpy as np import pytest -from transformers import BertTokenizer, BertTokenizerFast +from transformers import BertTokenizer, BertTokenizerFast, GroundingDINOProcessor from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_torch, require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available +from transformers.testing_utils import require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available -if is_torch_available(): - from transformers import GroundingDINOProcessor - if is_vision_available(): from PIL import Image from transformers import GroundingDINOImageProcessor -@require_torch @require_vision class GroundingDINOProcessorTest(unittest.TestCase): def setUp(self): From 5e1f0d97518c7422f1025ea9f2bef893b1bf1e84 Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 23 Nov 2023 13:40:33 +0100 Subject: [PATCH 146/252] Fix style --- .../grounding_dino/test_image_processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index cca1233e6d7bc6..3c24b9cedd4340 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -126,6 +126,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F ) +@require_torch @require_vision # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDINO class GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): @@ -160,7 +161,6 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.do_pad, False) @slow - @require_torch def test_call_pytorch_with_coco_detection_annotations(self): # prepare image and target image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") From c9a84403b7ee7d29f11dee80fa6dae9a19d77084 Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 23 Nov 2023 14:22:35 +0100 Subject: [PATCH 147/252] Add is_torch_available --- .../models/grounding_dino/processing_grounding_dino.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 00c1e864faf06a..1164f6541f5fcd 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -18,12 +18,14 @@ from typing import List, Optional, Tuple, Union -import torch - 
from ...image_utils import ImageInput from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...utils import TensorType, is_torch_available + + +if is_torch_available(): + import torch def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTensor): From f954f4bdb388969b48aa14e9100e45eed85d2b89 Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 23 Nov 2023 15:06:35 +0100 Subject: [PATCH 148/252] Remove type hints --- .../models/grounding_dino/processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 1164f6541f5fcd..ac3d44eaa758b4 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -28,7 +28,7 @@ import torch -def get_phrases_from_posmap(posmaps: torch.BoolTensor, input_ids: torch.LongTensor): +def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. Args: From 2eb2a98274ed889176db066df58f17c7a8c525b2 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:31:32 -0300 Subject: [PATCH 149/252] vocab_tokens as one liner --- .../test_processor_grounding_dino.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 7b658d8724dd68..13a133c80cc861 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -38,23 +38,7 @@ class GroundingDINOProcessorTest(unittest.TestCase): def setUp(self): self.tmpdirname = tempfile.mkdtemp() - vocab_tokens = [ - "[UNK]", - "[CLS]", - "[SEP]", - "[PAD]", - "[MASK]", - "want", - "##want", - "##ed", - "wa", - "un", - "runn", - "##ing", - ",", - "low", - "lowest", - ] + vocab_tokens = ["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]","want","##want","##ed","wa","un","runn","##ing",",","low","lowest"] # fmt: skip self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) From 625123a087d804b4260fb17d68180f8ee87c34db Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:33:30 -0300 Subject: [PATCH 150/252] Removed print statements --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 883540be9c8a03..f70a71c1d6a741 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -358,8 +358,6 @@ def convert_grounding_dino_checkpoint(args): model = GroundingDINOForObjectDetection(config) model.eval() missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) # Load and process test image image = prepare_img() @@ -379,8 +377,6 @@ def 
convert_grounding_dino_checkpoint(args): with torch.no_grad(): outputs = model(**inputs) - print("First values of logits:", outputs.logits[0, :3, :3]) - print("First values of boxes:", outputs.pred_boxes[0, :3, :3]) # verify outputs expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]) @@ -389,15 +385,12 @@ def convert_grounding_dino_checkpoint(args): ) assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-3) - print("Looks ok!") if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print(f"Pushing model and processor for {model_name} to hub") model.push_to_hub(f"EduardoPacheco/{model_name}") processor.push_to_hub(f"EduardoPacheco/{model_name}") From 4553ad1695cc41c6a613c8272c7faa1462f1988d Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:36:53 -0300 Subject: [PATCH 151/252] Renamed GroundingDINOTextPrenetConfig to GroundingDINOTextConfig --- docs/source/en/model_doc/grounding-dino.md | 4 ++-- src/transformers/__init__.py | 4 ++-- .../models/grounding_dino/__init__.py | 4 ++-- .../configuration_grounding_dino.py | 16 ++++++++-------- .../grounding_dino/modeling_grounding_dino.py | 4 ++-- .../test_modeling_grounding_dino.py | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index ef41448d3d06ef..bb95255d28014b 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -47,9 +47,9 @@ The original code can be found [here](https://github.com/IDEA-Research/Grounding [[autodoc]] GroundingDINOProcessor -## GroundingDINOTextPrenetConfig +## GroundingDINOTextConfig -[[autodoc]] GroundingDINOTextPrenetConfig +[[autodoc]] GroundingDINOTextConfig ## GroundingDINOConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0ec3644d249a18..79b8fd62edaa00 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -374,7 +374,7 @@ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", "GroundingDINOProcessor", - "GroundingDINOTextPrenetConfig", + "GroundingDINOTextConfig", ], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -4603,7 +4603,7 @@ GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, GroundingDINOProcessor, - GroundingDINOTextPrenetConfig, + GroundingDINOTextConfig, ) from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index 67ffc2becc52c1..b5db32c0f8ae47 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -21,7 +21,7 @@ "configuration_grounding_dino": [ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GroundingDINOConfig", - "GroundingDINOTextPrenetConfig", + "GroundingDINOTextConfig", ], "processing_grounding_dino": ["GroundingDINOProcessor"], } @@ -52,7 +52,7 @@ from .configuration_grounding_dino import ( GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, GroundingDINOConfig, - GroundingDINOTextPrenetConfig, + GroundingDINOTextConfig, ) from .processing_grounding_dino import 
GroundingDINOProcessor diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 0f49dc4be95a4d..264ba34faf20b6 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -28,7 +28,7 @@ } -class GroundingDINOTextPrenetConfig(PretrainedConfig): +class GroundingDINOTextConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a @@ -78,10 +78,10 @@ class GroundingDINOTextPrenetConfig(PretrainedConfig): Examples: ```python - >>> from transformers import GroundingDINOTextPrenetConfig, GroundingDINOConfig, GroundingDINOForObjectDetection + >>> from transformers import GroundingDINOTextConfig, GroundingDINOConfig, GroundingDINOForObjectDetection >>> # Initializing a BERT bert-base-uncased style configuration - >>> configuration = GroundingDINOTextPrenetConfig() + >>> configuration = GroundingDINOTextConfig() >>> # Initializing a GroundingDINOConfig with generated bert-like config >>> config = GroundingDINOConfig(text_backbone_config=configuration) @@ -161,7 +161,7 @@ class GroundingDINOConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): The configuration of the backbone model. - text_backbone_config (`str`, *optional*, defaults to `GroundingDINOTextPrenetConfig()`): + text_backbone_config (`str`, *optional*, defaults to `GroundingDINOTextConfig()`): The configuration of the text backbone model. Should be a BERT-like config. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects @@ -343,14 +343,14 @@ def __init__( self.disable_custom_kernels = disable_custom_kernels # Text backbone if text_backbone_config is None: - self.text_backbone_config = GroundingDINOTextPrenetConfig() + self.text_backbone_config = GroundingDINOTextConfig() elif isinstance(text_backbone_config, dict): - self.text_backbone_config = GroundingDINOTextPrenetConfig(**text_backbone_config) - elif isinstance(text_backbone_config, GroundingDINOTextPrenetConfig): + self.text_backbone_config = GroundingDINOTextConfig(**text_backbone_config) + elif isinstance(text_backbone_config, GroundingDINOTextConfig): self.text_backbone_config = text_backbone_config else: raise ValueError( - f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDINOTextPrenetConfig`. Received {type(text_backbone_config)} instead." + f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDINOTextConfig`. Received {type(text_backbone_config)} instead." 
) self.max_text_len = max_text_len # Text Enhancer diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2aeff26ad8ecbf..5741acbd7e7d2f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -46,7 +46,7 @@ from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone -from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextPrenetConfig +from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextConfig from .load_custom import load_cuda_kernels @@ -3572,7 +3572,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): - config_class = GroundingDINOTextPrenetConfig + config_class = GroundingDINOTextConfig def __init__(self, config, add_pooling_layer=True): super().__init__(config) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index cb2af2109cdaef..f8fc49fd3754ea 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -23,7 +23,7 @@ from transformers import ( GroundingDINOConfig, - GroundingDINOTextPrenetConfig, + GroundingDINOTextConfig, SwinConfig, is_torch_available, is_vision_available, @@ -146,7 +146,7 @@ def get_config(self): out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) - text_backbone = GroundingDINOTextPrenetConfig( + text_backbone = GroundingDINOTextConfig( hidden_size=8, num_hidden_layers=2, num_attention_heads=2, intermediate_size=8, max_position_embeddings=8 ) return GroundingDINOConfig( From 3b6b2c2479c543aa3d95c437b487e6ebb52022cb Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:38:32 -0300 Subject: [PATCH 152/252] remove unnecessary comments --- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index f70a71c1d6a741..04a7772fef19cb 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -70,7 +70,6 @@ def get_grounding_dino_config(model_name): def create_rename_keys(state_dict, config): rename_keys = [] # fmt: off - #TODO names might change after modifing GroundingDINOModel class ########################################## VISION BACKBONE - START # patch embedding layer rename_keys.append(("backbone.0.patch_embed.proj.weight", From afb26499a15f4d7ea68d271697f5348131811971 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 13:41:58 -0300 Subject: [PATCH 153/252] Removed unnecessary tests on conversion script --- .../grounding_dino/convert_grounding_dino_to_hf.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 04a7772fef19cb..b075e43969dd85 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ 
b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -374,16 +374,7 @@ def convert_grounding_dino_checkpoint(args): # Running forward with torch.no_grad(): - outputs = model(**inputs) - - - # verify outputs - expected_boxes = torch.tensor([[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]) - expected_logits = torch.tensor( - [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] - ) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-3) + _ = model(**inputs) if pytorch_dump_folder_path is not None: model.save_pretrained(pytorch_dump_folder_path) From 4fdaf425ac7e8b3b5741bd00bfe0e6952a910900 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 14:34:20 -0300 Subject: [PATCH 154/252] Renamed GroundingDINO to camel case GroundingDino --- docs/source/en/model_doc/grounding-dino.md | 26 +- src/transformers/__init__.py | 28 +- .../models/auto/configuration_auto.py | 2 +- .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 +- .../models/grounding_dino/__init__.py | 28 +- .../configuration_grounding_dino.py | 40 +-- .../convert_grounding_dino_to_hf.py | 26 +- .../image_processing_grounding_dino.py | 24 +- .../grounding_dino/modeling_grounding_dino.py | 274 +++++++++--------- .../processing_grounding_dino.py | 8 +- src/transformers/utils/dummy_pt_objects.py | 6 +- .../utils/dummy_vision_objects.py | 2 +- .../test_image_processing_grounding_dino.py | 20 +- .../test_modeling_grounding_dino.py | 44 +-- .../test_processor_grounding_dino.py | 50 ++-- utils/check_repo.py | 2 +- 17 files changed, 293 insertions(+), 293 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index bb95255d28014b..f3ccc78ad5c876 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -26,7 +26,7 @@ The abstract from the paper is the following: Tips: -- One can use [`GroundingDINOProcessor`] to prepare image-text pairs for the model. +- One can use [`GroundingDinoProcessor`] to prepare image-text pairs for the model. drawing @@ -37,30 +37,30 @@ This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPac The original code can be found [here](https://github.com/IDEA-Research/GroundingDINO). 
-## GroundingDINOImageProcessor +## GroundingDinoImageProcessor -[[autodoc]] GroundingDINOImageProcessor +[[autodoc]] GroundingDinoImageProcessor - preprocess - post_process_object_detection -## GroundingDINOProcessor +## GroundingDinoProcessor -[[autodoc]] GroundingDINOProcessor +[[autodoc]] GroundingDinoProcessor -## GroundingDINOTextConfig +## GroundingDinoTextConfig -[[autodoc]] GroundingDINOTextConfig +[[autodoc]] GroundingDinoTextConfig -## GroundingDINOConfig +## GroundingDinoConfig -[[autodoc]] GroundingDINOConfig +[[autodoc]] GroundingDinoConfig -## GroundingDINOModel +## GroundingDinoModel -[[autodoc]] GroundingDINOModel +[[autodoc]] GroundingDinoModel - forward -## GroundingDINOForObjectDetection +## GroundingDinoForObjectDetection -[[autodoc]] GroundingDINOForObjectDetection +[[autodoc]] GroundingDinoForObjectDetection - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 79b8fd62edaa00..ac41ec69ca30b6 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -372,9 +372,9 @@ "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"], "models.grounding_dino": [ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", - "GroundingDINOConfig", - "GroundingDINOProcessor", - "GroundingDINOTextConfig", + "GroundingDinoConfig", + "GroundingDinoProcessor", + "GroundingDinoTextConfig", ], "models.groupvit": [ "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -994,7 +994,7 @@ _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) - _import_structure["models.grounding_dino"].extend(["GroundingDINOImageProcessor"]) + _import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"]) _import_structure["models.idefics"].extend(["IdeficsImageProcessor"]) _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"]) _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"]) @@ -2001,9 +2001,9 @@ _import_structure["models.grounding_dino"].extend( [ "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", - "GroundingDINOForObjectDetection", - "GroundingDINOModel", - "GroundingDINOPreTrainedModel", + "GroundingDinoForObjectDetection", + "GroundingDinoModel", + "GroundingDinoPreTrainedModel", ] ) _import_structure["models.groupvit"].extend( @@ -4601,9 +4601,9 @@ from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig from .models.grounding_dino import ( GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, - GroundingDINOConfig, - GroundingDINOProcessor, - GroundingDINOTextConfig, + GroundingDinoConfig, + GroundingDinoProcessor, + GroundingDinoTextConfig, ) from .models.groupvit import ( GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5164,7 +5164,7 @@ from .models.flava import FlavaFeatureExtractor, FlavaImageProcessor, FlavaProcessor from .models.fuyu import FuyuImageProcessor, FuyuProcessor from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor - from .models.grounding_dino import GroundingDINOImageProcessor + from .models.grounding_dino import GroundingDinoImageProcessor from .models.idefics import IdeficsImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor @@ 
-6010,9 +6010,9 @@ ) from .models.grounding_dino import ( GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, - GroundingDINOForObjectDetection, - GroundingDINOModel, - GroundingDINOPreTrainedModel, + GroundingDinoForObjectDetection, + GroundingDinoModel, + GroundingDinoPreTrainedModel, ) from .models.groupvit import ( GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 5ededa2c191d09..01060b3dc3f31b 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -110,7 +110,7 @@ ("gptj", "GPTJConfig"), ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), - ("grounding-dino", "GroundingDINOConfig"), + ("grounding-dino", "GroundingDinoConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 99a8cc0387b18a..da4c75f558060f 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -67,7 +67,7 @@ ("fuyu", "FuyuImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), - ("grounding-dino", "GroundingDINOImageProcessor"), + ("grounding-dino", "GroundingDinoImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 27c9b4ce094424..c64e2ab050ee51 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -106,7 +106,7 @@ ("gptj", "GPTJModel"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), - ("grounding-dino", "GroundingDINOModel"), + ("grounding-dino", "GroundingDinoModel"), ("groupvit", "GroupViTModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), @@ -648,7 +648,7 @@ MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Zero Shot Object Detection mapping - ("grounding-dino", "GroundingDINOForObjectDetection"), + ("grounding-dino", "GroundingDinoForObjectDetection"), ("owlv2", "Owlv2ForObjectDetection"), ("owlvit", "OwlViTForObjectDetection"), ] diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py index b5db32c0f8ae47..6dfe21cf83d5e0 100644 --- a/src/transformers/models/grounding_dino/__init__.py +++ b/src/transformers/models/grounding_dino/__init__.py @@ -20,10 +20,10 @@ _import_structure = { "configuration_grounding_dino": [ "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", - "GroundingDINOConfig", - "GroundingDINOTextConfig", + "GroundingDinoConfig", + "GroundingDinoTextConfig", ], - "processing_grounding_dino": ["GroundingDINOProcessor"], + "processing_grounding_dino": ["GroundingDinoProcessor"], } try: @@ -34,9 +34,9 @@ else: _import_structure["modeling_grounding_dino"] = [ "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", - "GroundingDINOForObjectDetection", - "GroundingDINOModel", - "GroundingDINOPreTrainedModel", + "GroundingDinoForObjectDetection", + "GroundingDinoModel", + "GroundingDinoPreTrainedModel", ] try: @@ -45,16 +45,16 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["image_processing_grounding_dino"] = 
["GroundingDINOImageProcessor"] + _import_structure["image_processing_grounding_dino"] = ["GroundingDinoImageProcessor"] if TYPE_CHECKING: from .configuration_grounding_dino import ( GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, - GroundingDINOConfig, - GroundingDINOTextConfig, + GroundingDinoConfig, + GroundingDinoTextConfig, ) - from .processing_grounding_dino import GroundingDINOProcessor + from .processing_grounding_dino import GroundingDinoProcessor try: if not is_torch_available(): @@ -64,9 +64,9 @@ else: from .modeling_grounding_dino import ( GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, - GroundingDINOForObjectDetection, - GroundingDINOModel, - GroundingDINOPreTrainedModel, + GroundingDinoForObjectDetection, + GroundingDinoModel, + GroundingDinoPreTrainedModel, ) try: @@ -75,7 +75,7 @@ except OptionalDependencyNotAvailable: pass else: - from .image_processing_grounding_dino import GroundingDINOImageProcessor + from .image_processing_grounding_dino import GroundingDinoImageProcessor else: import sys diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 264ba34faf20b6..e7091ba2b695d7 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -28,9 +28,9 @@ } -class GroundingDINOTextConfig(PretrainedConfig): +class GroundingDinoTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOTextPrenetModel`]. It is used to + This is the configuration class to store the configuration of a [`GroundingDinoTextPrenetModel`]. It is used to instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. @@ -41,7 +41,7 @@ class GroundingDINOTextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`GroundingDINOTextPrenetModel`]. + `inputs_ids` passed when calling [`GroundingDinoTextPrenetModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -61,7 +61,7 @@ class GroundingDINOTextConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`GroundingDINOTextPrenetModel`]. + The vocabulary size of the `token_type_ids` passed when calling [`GroundingDinoTextPrenetModel`]. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. 
pad_token_id (`int`, *optional*, defaults to 0): @@ -78,16 +78,16 @@ class GroundingDINOTextConfig(PretrainedConfig): Examples: ```python - >>> from transformers import GroundingDINOTextConfig, GroundingDINOConfig, GroundingDINOForObjectDetection + >>> from transformers import GroundingDinoTextConfig, GroundingDinoConfig, GroundingDinoForObjectDetection >>> # Initializing a BERT bert-base-uncased style configuration - >>> configuration = GroundingDINOTextConfig() + >>> configuration = GroundingDinoTextConfig() - >>> # Initializing a GroundingDINOConfig with generated bert-like config - >>> config = GroundingDINOConfig(text_backbone_config=configuration) + >>> # Initializing a GroundingDinoConfig with generated bert-like config + >>> config = GroundingDinoConfig(text_backbone_config=configuration) >>> # Initializing a model from the ground-up with a config - >>> model = GroundingDINOForObjectDetection(config) + >>> model = GroundingDinoForObjectDetection(config) >>> # Accessing the model configuration >>> configuration = model.config @@ -148,9 +148,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return cls.from_dict(config_dict, **kwargs) -class GroundingDINOConfig(PretrainedConfig): +class GroundingDinoConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`GroundingDINOModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`GroundingDinoModel`]. It is used to instantiate a Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Grounding DINO [EduardoPacheco/grounding-dino-tiny](https://huggingface.co/EduardoPacheco/grounding-dino-tiny) architecture. @@ -161,11 +161,11 @@ class GroundingDINOConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): The configuration of the backbone model. - text_backbone_config (`str`, *optional*, defaults to `GroundingDINOTextConfig()`): + text_backbone_config (`str`, *optional*, defaults to `GroundingDinoTextConfig()`): The configuration of the text backbone model. Should be a BERT-like config. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects - [`GroundingDINOModel`] can detect in a single image. + [`GroundingDinoModel`] can detect in a single image. encoder_layers (`int`, *optional*, defaults to 6): Number of encoder layers. 
encoder_ffn_dim (`int`, *optional*, defaults to 2048): @@ -244,13 +244,13 @@ class GroundingDINOConfig(PretrainedConfig): Examples: ```python - >>> from transformers import GroundingDINOConfig, GroundingDINOModel + >>> from transformers import GroundingDinoConfig, GroundingDinoModel >>> # Initializing a Grounding DINO EduardoPacheco/grounding-dino-tiny style configuration - >>> configuration = GroundingDINOConfig() + >>> configuration = GroundingDinoConfig() >>> # Initializing a model (with random weights) from the EduardoPacheco/grounding-dino-tiny style configuration - >>> model = GroundingDINOModel(configuration) + >>> model = GroundingDinoModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -343,14 +343,14 @@ def __init__( self.disable_custom_kernels = disable_custom_kernels # Text backbone if text_backbone_config is None: - self.text_backbone_config = GroundingDINOTextConfig() + self.text_backbone_config = GroundingDinoTextConfig() elif isinstance(text_backbone_config, dict): - self.text_backbone_config = GroundingDINOTextConfig(**text_backbone_config) - elif isinstance(text_backbone_config, GroundingDINOTextConfig): + self.text_backbone_config = GroundingDinoTextConfig(**text_backbone_config) + elif isinstance(text_backbone_config, GroundingDinoTextConfig): self.text_backbone_config = text_backbone_config else: raise ValueError( - f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDINOTextConfig`. Received {type(text_backbone_config)} instead." + f"`text_backbone_config` should be either a `dict` or an instance of `GroundingDinoTextConfig`. Received {type(text_backbone_config)} instead." ) self.max_text_len = max_text_len # Text Enhancer diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index b075e43969dd85..066e0a209a0f53 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert GroundingDINO SimMIM checkpoints from the original repository. +"""Convert GroundingDino SimMIM checkpoints from the original repository. 
URL: https://github.com/IDEA-Research/GroundingDINO""" @@ -25,10 +25,10 @@ from transformers import ( AutoTokenizer, - GroundingDINOConfig, - GroundingDINOForObjectDetection, - GroundingDINOImageProcessor, - GroundingDINOProcessor, + GroundingDinoConfig, + GroundingDinoForObjectDetection, + GroundingDinoImageProcessor, + GroundingDinoProcessor, SwinConfig, ) @@ -62,7 +62,7 @@ def get_grounding_dino_config(model_name): out_indices=[2, 3, 4], ) - config = GroundingDINOConfig(backbone_config=backbone_config) + config = GroundingDinoConfig(backbone_config=backbone_config) return config @@ -334,10 +334,10 @@ def convert_grounding_dino_checkpoint(args): push_to_hub = args.push_to_hub checkpoint_mapping = { - "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth", - "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth", + "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth", + "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth", } - # Define default GroundingDINO configuation + # Define default GroundingDino configuation config = get_grounding_dino_config(model_name) # Load original checkpoint @@ -354,7 +354,7 @@ def convert_grounding_dino_checkpoint(args): read_in_q_k_v(new_state_dict, config) # Load HF model - model = GroundingDINOForObjectDetection(config) + model = GroundingDinoForObjectDetection(config) model.eval() missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) @@ -363,9 +363,9 @@ def convert_grounding_dino_checkpoint(args): transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) original_pixel_values = transforms(image).unsqueeze(0) - image_processor = GroundingDINOImageProcessor() + image_processor = GroundingDinoImageProcessor() tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") - processor = GroundingDINOProcessor(image_processor=image_processor, tokenizer=tokenizer) + processor = GroundingDinoProcessor(image_processor=image_processor, tokenizer=tokenizer) text = "a cat" inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") @@ -393,7 +393,7 @@ def convert_grounding_dino_checkpoint(args): default="grounding-dino-tiny", type=str, choices=["grounding-dino-tiny", "grounding-dino-base"], - help="Name of the GroundingDINO model you'd like to convert.", + help="Name of the GroundingDino model you'd like to convert.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
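As a side note on how the conversion script above operates: `create_rename_keys` yields `(original_key, hf_key)` pairs, the checkpoint's state dict is rewritten accordingly, attention q/k/v weights are split in `read_in_q_k_v`, and only then is `load_state_dict` called. A minimal sketch of the renaming step follows, with a stand-in tensor and a guessed HF-side key name purely for illustration; the authoritative mapping lives in `create_rename_keys` in this file.

```python
import torch


def rename_key(state_dict: dict, old_key: str, new_key: str) -> None:
    """Move a tensor from its original checkpoint name to the Hugging Face name."""
    state_dict[new_key] = state_dict.pop(old_key)


# Illustrative pair only: the source key is real, the target key is a guess at the HF naming.
rename_keys = [
    (
        "backbone.0.patch_embed.proj.weight",
        "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight",
    ),
]

# Stand-in for the checkpoint loaded from groundingdino_swint_ogc.pth.
original_state_dict = {"backbone.0.patch_embed.proj.weight": torch.zeros(96, 3, 4, 4)}

new_state_dict = dict(original_state_dict)
for old_key, new_key in rename_keys:
    rename_key(new_state_dict, old_key, new_key)

# After the q/k/v splitting step, the script calls:
# model.load_state_dict(new_state_dict, strict=False)
```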
diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 251289f7add757..d98892922c0024 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -286,7 +286,7 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndar return masks -# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDINO +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDino def prepare_coco_detection_annotation( image, target, @@ -294,7 +294,7 @@ def prepare_coco_detection_annotation( input_data_format: Optional[Union[ChannelDimension, str]] = None, ): """ - Convert the target in COCO format into the format expected by GroundingDINO. + Convert the target in COCO format into the format expected by GroundingDino. """ image_height, image_width = get_image_size(image, channel_dim=input_data_format) @@ -379,7 +379,7 @@ def masks_to_boxes(masks: np.ndarray) -> np.ndarray: return np.stack([x_min, y_min, x_max, y_max], 1) -# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDINO +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDino def prepare_coco_panoptic_annotation( image: np.ndarray, target: Dict, @@ -388,7 +388,7 @@ def prepare_coco_panoptic_annotation( input_data_format: Union[ChannelDimension, str] = None, ) -> Dict: """ - Prepare a coco panoptic annotation for GroundingDINO. + Prepare a coco panoptic annotation for GroundingDino. """ image_height, image_width = get_image_size(image, channel_dim=input_data_format) annotation_path = pathlib.Path(masks_path) / target["file_name"] @@ -758,7 +758,7 @@ def compute_segments( return segmentation, segments -class GroundingDINOImageProcessor(BaseImageProcessor): +class GroundingDinoImageProcessor(BaseImageProcessor): r""" Constructs a Grounding DINO image processor. @@ -839,11 +839,11 @@ def __init__( self.do_pad = do_pad @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDINO + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDino def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `GroundingDINOImageProcessor.from_pretrained(checkpoint, size=600, + created using from_dict and kwargs e.g. 
`GroundingDinoImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)` """ image_processor_dict = image_processor_dict.copy() @@ -853,7 +853,7 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDINO + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDino def prepare_annotation( self, image: np.ndarray, @@ -864,7 +864,7 @@ def prepare_annotation( input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> Dict: """ - Prepare an annotation for feeding into GroundingDINO model. + Prepare an annotation for feeding into GroundingDino model. """ format = format if format is not None else self.format @@ -1345,16 +1345,16 @@ def preprocess( return encoded_inputs - # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDINO + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino def post_process_object_detection( self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None ): """ - Converts the raw output of [`GroundingDINOForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. Args: - outputs ([`GroundingDINOObjectDetectionOutput`]): + outputs ([`GroundingDinoObjectDetectionOutput`]): Raw outputs of the model. threshold (`float`, *optional*): Score threshold to keep object detection predictions. diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 5741acbd7e7d2f..664f549603b6e3 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -46,7 +46,7 @@ from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import is_ninja_available, logging from ..auto import AutoBackbone -from .configuration_grounding_dino import GroundingDINOConfig, GroundingDINOTextConfig +from .configuration_grounding_dino import GroundingDinoConfig, GroundingDinoTextConfig from .load_custom import load_cuda_kernels @@ -71,7 +71,7 @@ logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "GroundingDINOConfig" +_CONFIG_FOR_DOC = "GroundingDinoConfig" _CHECKPOINT_FOR_DOC = "EduardoPacheco/grounding-dino-tiny" GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -130,9 +130,9 @@ def backward(context, grad_output): @dataclass -class GroundingDINODecoderOutput(ModelOutput): +class GroundingDinoDecoderOutput(ModelOutput): """ - Base class for outputs of the GroundingDINODecoder. This class adds two attributes to + Base class for outputs of the GroundingDinoDecoder. This class adds two attributes to BaseModelOutputWithCrossAttentions, namely: - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) - a stacked tensor of intermediate reference points. 
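Since `post_process_object_detection` is touched above, a short usage sketch may help: the model emits normalized `(center_x, center_y, width, height)` boxes, and this method (copied from OwlViT, so it returns a list of dicts with `scores`, `labels` and `boxes`) rescales them to absolute `(x_min, y_min, x_max, y_max)` coordinates. The threshold value and the reuse of `processor`, `model`, `image` and `inputs` from the earlier sketch are illustrative assumptions.

```python
import torch

# `processor`, `model`, `image` and `inputs` as in the earlier processor sketch.
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes is (height, width) per image, so normalized boxes can be mapped back to pixels.
target_sizes = torch.tensor([image.size[::-1]])
results = processor.image_processor.post_process_object_detection(
    outputs, threshold=0.35, target_sizes=target_sizes
)[0]

for score, box in zip(results["scores"], results["boxes"]):
    # box is (x_min, y_min, x_max, y_max) in absolute pixel coordinates
    print(f"score={score.item():.2f}, box={[round(coord, 1) for coord in box.tolist()]}")
```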
@@ -162,9 +162,9 @@ class GroundingDINODecoderOutput(ModelOutput): @dataclass -class GroundingDINOEncoderOutput(ModelOutput): +class GroundingDinoEncoderOutput(ModelOutput): """ - Base class for outputs of the GroundingDINOEncoder. This class extends BaseModelOutput, due to: + Base class for outputs of the GroundingDinoEncoder. This class extends BaseModelOutput, due to: - vision and text last hidden states - vision and text intermediate hidden states @@ -196,7 +196,7 @@ class GroundingDINOEncoderOutput(ModelOutput): @dataclass -class GroundingDINOModelOutput(ModelOutput): +class GroundingDinoModelOutput(ModelOutput): """ Base class for outputs of the Grounding DINO encoder-decoder model. @@ -259,9 +259,9 @@ class GroundingDINOModelOutput(ModelOutput): @dataclass -class GroundingDINOObjectDetectionOutput(ModelOutput): +class GroundingDinoObjectDetectionOutput(ModelOutput): """ - Output type of [`GroundingDINOForObjectDetection`]. + Output type of [`GroundingDinoForObjectDetection`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): @@ -275,7 +275,7 @@ class GroundingDINOObjectDetectionOutput(ModelOutput): pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use [`~GroundingDINOProcessor.post_process_object_detection`] to retrieve the + possible padding). You can use [`~GroundingDinoProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`List[Dict]`, *optional*): Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) @@ -353,8 +353,8 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDINO -class GroundingDINOFrozenBatchNorm2d(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino +class GroundingDinoFrozenBatchNorm2d(nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed. @@ -393,10 +393,10 @@ def forward(self, x): return x * scale + bias -# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDINO +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDino def replace_batch_norm(model): r""" - Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDINOFrozenBatchNorm2d`. + Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDinoFrozenBatchNorm2d`. Args: model (torch.nn.Module): @@ -404,7 +404,7 @@ def replace_batch_norm(model): """ for name, module in model.named_children(): if isinstance(module, nn.BatchNorm2d): - new_module = GroundingDINOFrozenBatchNorm2d(module.num_features) + new_module = GroundingDinoFrozenBatchNorm2d(module.num_features) if not module.weight.device == torch.device("meta"): new_module.weight.data.copy_(module.weight) @@ -418,11 +418,11 @@ def replace_batch_norm(model): replace_batch_norm(module) -class GroundingDINOConvEncoder(nn.Module): +class GroundingDinoConvEncoder(nn.Module): """ Convolutional backbone using the AutoBackbone API. - nn.BatchNorm2d layers are replaced by GroundingDINOFrozenBatchNorm2d as defined above. 
+ nn.BatchNorm2d layers are replaced by GroundingDinoFrozenBatchNorm2d as defined above. """ def __init__(self, config): @@ -455,8 +455,8 @@ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): return out -# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDINO -class GroundingDINOConvModel(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDino +class GroundingDinoConvModel(nn.Module): """ This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. """ @@ -477,7 +477,7 @@ def forward(self, pixel_values, pixel_mask): return out, pos -class GroundingDINOSinePositionEmbedding(nn.Module): +class GroundingDinoSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. @@ -516,7 +516,7 @@ def forward(self, pixel_values, pixel_mask): # Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding -class GroundingDINOLearnedPositionEmbedding(nn.Module): +class GroundingDinoLearnedPositionEmbedding(nn.Module): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -543,11 +543,11 @@ def build_position_encoding(config): n_steps = config.d_model // 2 if config.position_embedding_type == "sine": # TODO find a better way of exposing other arguments - position_embedding = GroundingDINOSinePositionEmbedding( + position_embedding = GroundingDinoSinePositionEmbedding( n_steps, config.positional_embedding_temperature, normalize=True ) elif config.position_embedding_type == "learned": - position_embedding = GroundingDINOLearnedPositionEmbedding(n_steps) + position_embedding = GroundingDinoLearnedPositionEmbedding(n_steps) else: raise ValueError(f"Not supported {config.position_embedding_type}") @@ -594,13 +594,13 @@ def multi_scale_deformable_attention( return output.transpose(1, 2).contiguous() -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDINO,Deformable DETR->Grounding DINO -class GroundingDINOMultiscaleDeformableAttention(nn.Module): +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDino,Deformable DETR->Grounding DINO +class GroundingDinoMultiscaleDeformableAttention(nn.Module): """ Multiscale deformable attention as proposed in Deformable DETR. """ - def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): + def __init__(self, config: GroundingDinoConfig, num_heads: int, n_points: int): super().__init__() if config.d_model % num_heads != 0: raise ValueError( @@ -610,7 +610,7 @@ def __init__(self, config: GroundingDINOConfig, num_heads: int, n_points: int): # check if dim_per_head is power of 2 if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): warnings.warn( - "You'd better set embed_dim (d_model) in GroundingDINOMultiscaleDeformableAttention to make the" + "You'd better set embed_dim (d_model) in GroundingDinoMultiscaleDeformableAttention to make the" " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" " implementation." 
) @@ -728,7 +728,7 @@ def forward( return output, attention_weights -class GroundingDINOTextEnhancerLayer(nn.Module): +class GroundingDinoTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" def __init__(self, config): @@ -760,7 +760,7 @@ def forward( position_embeddings: Optional[torch.FloatTensor] = None, ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: """Text self-attention to enhance projection of text features generated by - the text encoder (GroundingDINOTextPrenet) within GroundingDINOEncoderLayer + the text encoder (GroundingDinoTextPrenet) within GroundingDinoEncoderLayer Args: hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`): @@ -803,7 +803,7 @@ def forward( return hidden_states, attention_weights -class GroundingDINOBiMultiHeadAttention(nn.Module): +class GroundingDinoBiMultiHeadAttention(nn.Module): def __init__(self, config): super().__init__() @@ -975,8 +975,8 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals return output -# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDINO -class GroundingDINODropPath(nn.Module): +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDino +class GroundingDinoDropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" def __init__(self, drop_prob: Optional[float] = None) -> None: @@ -990,7 +990,7 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -class GroundingDINOFusionLayer(nn.Module): +class GroundingDinoFusionLayer(nn.Module): def __init__(self, config, init_values=1e-4): super().__init__() drop_path = config.fusion_droppath @@ -998,10 +998,10 @@ def __init__(self, config, init_values=1e-4): # pre layer norm self.layer_norm_vision = nn.LayerNorm(config.d_model) self.layer_norm_text = nn.LayerNorm(config.d_model) - self.attn = GroundingDINOBiMultiHeadAttention(config) + self.attn = GroundingDinoBiMultiHeadAttention(config) # add layer scale for training stability - self.drop_path = GroundingDINODropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path = GroundingDinoDropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.vision_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) self.text_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) @@ -1053,11 +1053,11 @@ def forward( # NOTE just renamed the class -class GroundingDINODeformableLayer(nn.Module): - def __init__(self, config: GroundingDINOConfig): +class GroundingDinoDeformableLayer(nn.Module): + def __init__(self, config: GroundingDinoConfig): super().__init__() self.embed_dim = config.d_model - self.self_attn = GroundingDINOMultiscaleDeformableAttention( + self.self_attn = GroundingDinoMultiscaleDeformableAttention( config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points ) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -1165,15 +1165,15 @@ def sine_func(x: torch.Tensor): return pos_res -class GroundingDINOEncoderLayer(nn.Module): +class GroundingDinoEncoderLayer(nn.Module): def __init__(self, config) -> None: super().__init__() self.d_model = config.d_model - self.text_enhancer_layer = GroundingDINOTextEnhancerLayer(config) - self.fusion_layer = GroundingDINOFusionLayer(config) - self.deformable_layer = GroundingDINODeformableLayer(config) + self.text_enhancer_layer = 
GroundingDinoTextEnhancerLayer(config) + self.fusion_layer = GroundingDinoFusionLayer(config) + self.deformable_layer = GroundingDinoDeformableLayer(config) def get_text_position_embeddings( self, text_features: Tensor, text_position_embedding: Tensor, text_position_ids: Tensor @@ -1240,8 +1240,8 @@ def forward( ) -class GroundingDINODecoderLayer(nn.Module): - def __init__(self, config: GroundingDINOConfig): +class GroundingDinoDecoderLayer(nn.Module): + def __init__(self, config: GroundingDinoConfig): super().__init__() self.embed_dim = config.d_model @@ -1266,7 +1266,7 @@ def __init__(self, config: GroundingDINOConfig): ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention - self.encoder_attn = GroundingDINOMultiscaleDeformableAttention( + self.encoder_attn = GroundingDinoMultiscaleDeformableAttention( config, num_heads=config.decoder_attention_heads, n_points=config.decoder_n_points, @@ -1358,7 +1358,7 @@ def forward( return outputs -class GroundingDINOContrastiveEmbedding(nn.Module): +class GroundingDinoContrastiveEmbedding(nn.Module): def __init__(self, config): super().__init__() self.max_text_len = config.max_text_len @@ -1380,7 +1380,7 @@ def forward( # Copied from transformers.models.detr.modeling_detr.DetrClassificationHead -class GroundingDINOClassificationHead(nn.Module): +class GroundingDinoClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): @@ -1398,20 +1398,20 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -class GroundingDINOPreTrainedModel(PreTrainedModel): - config_class = GroundingDINOConfig +class GroundingDinoPreTrainedModel(PreTrainedModel): + config_class = GroundingDinoConfig base_model_prefix = "model" main_input_name = "pixel_values" def _init_weights(self, module): std = self.config.init_std - if isinstance(module, GroundingDINOLearnedPositionEmbedding): + if isinstance(module, GroundingDinoLearnedPositionEmbedding): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) - elif isinstance(module, GroundingDINOMultiscaleDeformableAttention): + elif isinstance(module, GroundingDinoMultiscaleDeformableAttention): module._reset_parameters() - elif isinstance(module, GroundingDINOBiMultiHeadAttention): + elif isinstance(module, GroundingDinoBiMultiHeadAttention): nn.init.xavier_uniform_(module.vision_proj.weight) module.vision_proj.bias.data.fill_(0) nn.init.xavier_uniform_(module.text_proj.weight) @@ -1424,7 +1424,7 @@ def _init_weights(self, module): module.out_vision_proj.bias.data.fill_(0) nn.init.xavier_uniform_(module.out_text_proj.weight) module.out_text_proj.bias.data.fill_(0) - elif isinstance(module, (GroundingDINOEncoderLayer, GroundingDINODecoderLayer)): + elif isinstance(module, (GroundingDinoEncoderLayer, GroundingDinoDecoderLayer)): for p in module.parameters(): if p.dim() > 1: nn.init.normal_(p, mean=0.0, std=std) @@ -1438,7 +1438,7 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - elif isinstance(module, GroundingDINOMLPPredictionHead): + elif isinstance(module, GroundingDinoMLPPredictionHead): nn.init.constant_(module.layers[-1].weight.data, 0) nn.init.constant_(module.layers[-1].bias.data, 0) @@ -1449,7 +1449,7 @@ def _init_weights(self, module): nn.init.normal_(module.level_embed) def 
_set_gradient_checkpointing(self, module, value=False): - if isinstance(module, GroundingDINODecoder): + if isinstance(module, GroundingDinoDecoder): module.gradient_checkpointing = value @@ -1463,7 +1463,7 @@ def _set_gradient_checkpointing(self, module, value=False): and behavior. Parameters: - config ([`GroundingDINOConfig`]): + config ([`GroundingDinoConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. @@ -1474,7 +1474,7 @@ def _set_gradient_checkpointing(self, module, value=False): pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDINOImageProcessor.__call__`] for + Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDinoImageProcessor.__call__`] for details. pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): @@ -1489,7 +1489,7 @@ def _set_gradient_checkpointing(self, module, value=False): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. - Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDINOTokenizer.__call__`] for details. + Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDinoTokenizer.__call__`] for details. attention_mask (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: @@ -1522,22 +1522,22 @@ def _set_gradient_checkpointing(self, module, value=False): """ -class GroundingDINOEncoder(GroundingDINOPreTrainedModel): +class GroundingDinoEncoder(GroundingDinoPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a - [`GroundingDINOEncoderLayer`]. + [`GroundingDinoEncoderLayer`]. The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. Args: - config: GroundingDINOConfig + config: GroundingDinoConfig """ - def __init__(self, config: GroundingDINOConfig): + def __init__(self, config: GroundingDinoConfig): super().__init__(config) self.dropout = config.dropout - self.layers = nn.ModuleList([GroundingDINOEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layers = nn.ModuleList([GroundingDinoEncoderLayer(config) for _ in range(config.encoder_layers)]) # Initialize weights and apply final processing self.post_init() @@ -1681,7 +1681,7 @@ def forward( if not return_dict: enc_outputs = [vision_features, text_features, encoder_vision_states, encoder_text_states, all_attns] return tuple(v for v in enc_outputs if v is not None) - return GroundingDINOEncoderOutput( + return GroundingDinoEncoderOutput( last_hidden_state_vision=vision_features, last_hidden_state_text=text_features, hidden_states_vision=encoder_vision_states, @@ -1690,9 +1690,9 @@ def forward( ) -class GroundingDINODecoder(GroundingDINOPreTrainedModel): +class GroundingDinoDecoder(GroundingDinoPreTrainedModel): """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDINODecoderLayer`]. + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a [`GroundingDinoDecoderLayer`]. The decoder updates the query embeddings through multiple self-attention and cross-attention layers. @@ -1702,16 +1702,16 @@ class GroundingDINODecoder(GroundingDINOPreTrainedModel): - it also returns a stack of intermediate outputs and reference points from all decoding layers. Args: - config: GroundingDINOConfig + config: GroundingDinoConfig """ - def __init__(self, config: GroundingDINOConfig): + def __init__(self, config: GroundingDinoConfig): super().__init__(config) self.dropout = config.dropout self.layer_norm = nn.LayerNorm(config.d_model) - self.layers = nn.ModuleList([GroundingDINODecoderLayer(config) for _ in range(config.decoder_layers)]) - self.reference_points_head = GroundingDINOMLPPredictionHead( + self.layers = nn.ModuleList([GroundingDinoDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.reference_points_head = GroundingDinoMLPPredictionHead( config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2 ) self.gradient_checkpointing = False @@ -1941,7 +1941,7 @@ def custom_forward(*inputs): ] if v is not None ) - return GroundingDINODecoderOutput( + return GroundingDinoDecoderOutput( last_hidden_state=hidden_states, intermediate_hidden_states=intermediate, intermediate_reference_points=intermediate_reference_points, @@ -1999,14 +1999,14 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen """, GROUNDING_DINO_START_DOCSTRING, ) -class GroundingDINOModel(GroundingDINOPreTrainedModel): - def __init__(self, config: GroundingDINOConfig): +class GroundingDinoModel(GroundingDinoPreTrainedModel): + def __init__(self, config: GroundingDinoConfig): super().__init__(config) # Create backbone + positional encoding - backbone = GroundingDINOConvEncoder(config) + backbone = GroundingDinoConvEncoder(config) position_embeddings = build_position_encoding(config) - self.backbone = GroundingDINOConvModel(backbone, position_embeddings) + self.backbone = GroundingDinoConvModel(backbone, position_embeddings) # Create input projection layers if config.num_feature_levels > 1: @@ -2040,14 +2040,14 @@ def __init__(self, config: GroundingDINOConfig): ) # Create text backbone - self.text_backbone = GroundingDINOTextPrenet(config.text_backbone_config) + self.text_backbone = GroundingDinoTextPrenet(config.text_backbone_config) self.text_projection = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) - self.encoder = GroundingDINOEncoder(config) - self.decoder = GroundingDINODecoder(config) + self.encoder = GroundingDinoEncoder(config) + self.decoder = GroundingDinoDecoder(config) self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) @@ -2061,11 +2061,11 @@ def __init__(self, config: GroundingDINOConfig): ): self.encoder_output_bbox_embed = self.decoder.bbox_embed else: - self.encoder_output_bbox_embed = GroundingDINOMLPPredictionHead( + self.encoder_output_bbox_embed = GroundingDinoMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - self.encoder_output_class_embed = GroundingDINOContrastiveEmbedding(config) + self.encoder_output_class_embed = GroundingDinoContrastiveEmbedding(config) else: self.reference_points = nn.Embedding(config.num_queries, 4) @@ -2166,7 +2166,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) return 
object_query, output_proposals @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GroundingDINOModelOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=GroundingDinoModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: Tensor, @@ -2185,7 +2185,7 @@ def forward( Examples: ```python - >>> from transformers import AutoProcessor, GroundingDINOModel + >>> from transformers import AutoProcessor, GroundingDinoModel >>> from PIL import Image >>> import requests @@ -2194,7 +2194,7 @@ def forward( >>> text = "a cat." >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") - >>> model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) @@ -2315,9 +2315,9 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDINOEncoderOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, GroundingDINOEncoderOutput): - encoder_outputs = GroundingDINOEncoderOutput( + # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDinoEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, GroundingDinoEncoderOutput): + encoder_outputs = GroundingDinoEncoderOutput( last_hidden_state_vision=encoder_outputs[0], last_hidden_state_text=encoder_outputs[1], hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, @@ -2387,7 +2387,7 @@ def forward( return tuple_outputs - return GroundingDINOModelOutput( + return GroundingDinoModelOutput( init_reference_points=init_reference_points, last_hidden_state=decoder_outputs.last_hidden_state, intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, @@ -2411,19 +2411,19 @@ def forward( """, GROUNDING_DINO_START_DOCSTRING, ) -class GroundingDINOForObjectDetection(GroundingDINOPreTrainedModel): +class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required _tied_weights_keys = [r"bbox_embed\.[1-9]\d*"] - def __init__(self, config: GroundingDINOConfig): + def __init__(self, config: GroundingDinoConfig): super().__init__(config) # Deformable DETR encoder-decoder model - self.model = GroundingDINOModel(config) + self.model = GroundingDinoModel(config) # Detection heads on top - _class_embed = GroundingDINOContrastiveEmbedding(config) - _bbox_embed = GroundingDINOMLPPredictionHead( + _class_embed = GroundingDinoContrastiveEmbedding(config) + _bbox_embed = GroundingDinoMLPPredictionHead( input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) @@ -2448,7 +2448,7 @@ def _set_aux_loss(self, outputs_class, outputs_coord): return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GroundingDINOObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=GroundingDinoObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -2456,7 +2456,7 @@ def forward( attention_mask: 
torch.LongTensor = None, token_type_ids: torch.LongTensor = None, pixel_mask: Optional[torch.BoolTensor] = None, - encoder_outputs: Optional[Union[GroundingDINOEncoderOutput, Tuple]] = None, + encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, Tuple]] = None, labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -2474,7 +2474,7 @@ def forward( Examples: ```python - >>> from transformers import AutoProcessor, GroundingDINOForObjectDetection + >>> from transformers import AutoProcessor, GroundingDinoForObjectDetection >>> from PIL import Image >>> import requests @@ -2483,7 +2483,7 @@ def forward( >>> text = "a cat." >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") - >>> model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") >>> inputs = processor(images=image, text=text, return_tensors="pt") >>> outputs = model(**inputs) @@ -2560,12 +2560,12 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: # First: create the matcher - matcher = GroundingDINOHungarianMatcher( + matcher = GroundingDinoHungarianMatcher( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion losses = ["labels", "boxes", "cardinality"] - criterion = GroundingDINOLoss( + criterion = GroundingDinoLoss( matcher=matcher, num_classes=self.config.num_labels, focal_alpha=self.config.focal_alpha, @@ -2603,7 +2603,7 @@ def forward( return tuple_outputs - dict_outputs = GroundingDINOObjectDetectionOutput( + dict_outputs = GroundingDinoObjectDetectionOutput( loss=loss, loss_dict=loss_dict, logits=logits, @@ -2679,15 +2679,15 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f return loss.mean(1).sum() / num_boxes -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDINO -class GroundingDINOLoss(nn.Module): +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDino +class GroundingDinoLoss(nn.Module): """ - This class computes the losses for `GroundingDINOForObjectDetection`. The process happens in two steps: 1) we + This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box). Args: - matcher (`GroundingDINOHungarianMatcher`): + matcher (`GroundingDinoHungarianMatcher`): Module able to compute a matching between targets and proposals. num_classes (`int`): Number of object categories, omitting the special no-object category. @@ -2858,7 +2858,7 @@ def forward(self, outputs, targets): # Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead -class GroundingDINOMLPPredictionHead(nn.Module): +class GroundingDinoMLPPredictionHead(nn.Module): """ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, height and width of a bounding box w.r.t. an image. 
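# The MLP head described in the docstring above is small enough to sketch in full. This is an
# illustrative, self-contained version (the names `MLPPredictionHeadSketch` and `head` and the
# tensor sizes below are examples, not the library API): num_layers - 1 hidden Linear+ReLU layers
# followed by a final Linear layer whose 4 outputs are typically passed through a sigmoid by the
# caller to obtain normalized (center_x, center_y, width, height) boxes.
import torch
from torch import nn


class MLPPredictionHeadSketch(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int):
        super().__init__()
        dims = [input_dim] + [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(dims, dims[1:] + [output_dim]))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for i, layer in enumerate(self.layers):
            hidden_states = layer(hidden_states)
            if i < len(self.layers) - 1:
                hidden_states = torch.relu(hidden_states)
        return hidden_states


# Usage mirroring the detection head above: d_model-sized decoder states mapped to 4 box values.
head = MLPPredictionHeadSketch(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
boxes = head(torch.randn(2, 900, 256)).sigmoid()  # (batch_size, num_queries, 4), values in [0, 1]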
@@ -2879,8 +2879,8 @@ def forward(self, x): return x -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDINO -class GroundingDINOHungarianMatcher(nn.Module): +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino +class GroundingDinoHungarianMatcher(nn.Module): """ This class computes an assignment between the targets and the predictions of the network. @@ -3078,8 +3078,8 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): return NestedTensor(tensor, mask) -# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDINOText -class GroundingDINOTextEmbeddings(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDinoText +class GroundingDinoTextEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config): @@ -3143,8 +3143,8 @@ def forward( return embeddings -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDINOText -class GroundingDINOTextSelfAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDinoText +class GroundingDinoTextSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -3251,7 +3251,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in GroundingDINOTextModel forward() function) + # Apply the attention mask is (precomputed for all layers in GroundingDinoTextModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
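# The text self-attention above follows the standard BERT recipe: scaled dot-product scores, an
# additive mask (0 for visible tokens, a large negative value for padding), then a softmax. A
# minimal numeric sketch of just that score computation; all shapes and values are illustrative.
import math

import torch

batch_size, num_heads, seq_length, head_size = 2, 4, 6, 8
query = torch.randn(batch_size, num_heads, seq_length, head_size)
key = torch.randn(batch_size, num_heads, seq_length, head_size)
value = torch.randn(batch_size, num_heads, seq_length, head_size)
# Additive mask broadcast over heads and query positions; mask out the last two key positions.
attention_mask = torch.zeros(batch_size, 1, 1, seq_length)
attention_mask[..., -2:] = torch.finfo(torch.float32).min

attention_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_size)
attention_scores = attention_scores + attention_mask
attention_probs = torch.softmax(attention_scores, dim=-1)  # rows sum to 1, masked keys get ~0 weight
context = torch.matmul(attention_probs, value)              # (batch_size, num_heads, seq_length, head_size)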
@@ -3278,8 +3278,8 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDINOText -class GroundingDINOTextSelfOutput(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDinoText +class GroundingDinoTextSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -3293,12 +3293,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDINOText -class GroundingDINOTextAttention(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDinoText +class GroundingDinoTextAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = GroundingDINOTextSelfAttention(config, position_embedding_type=position_embedding_type) - self.output = GroundingDINOTextSelfOutput(config) + self.self = GroundingDinoTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = GroundingDinoTextSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): @@ -3343,8 +3343,8 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDINOText -class GroundingDINOTextIntermediate(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDinoText +class GroundingDinoTextIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) @@ -3359,8 +3359,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDINOText -class GroundingDINOTextOutput(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDinoText +class GroundingDinoTextOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -3374,21 +3374,21 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDINOText -class GroundingDINOTextLayer(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDinoText +class GroundingDinoTextLayer(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = GroundingDINOTextAttention(config) + self.attention = GroundingDinoTextAttention(config) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = GroundingDINOTextAttention(config, position_embedding_type="absolute") - self.intermediate = GroundingDINOTextIntermediate(config) - self.output = GroundingDINOTextOutput(config) + self.crossattention = GroundingDinoTextAttention(config, position_embedding_type="absolute") + self.intermediate = GroundingDinoTextIntermediate(config) + self.output = GroundingDinoTextOutput(config) def forward( 
self, @@ -3461,12 +3461,12 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDINOText -class GroundingDINOTextEncoder(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDinoText +class GroundingDinoTextEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([GroundingDINOTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([GroundingDinoTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -3555,8 +3555,8 @@ def forward( ) -# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDINOText -class GroundingDINOTextPooler(nn.Module): +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDinoText +class GroundingDinoTextPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -3571,17 +3571,17 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output -class GroundingDINOTextPrenet(GroundingDINOPreTrainedModel): - config_class = GroundingDINOTextConfig +class GroundingDinoTextPrenet(GroundingDinoPreTrainedModel): + config_class = GroundingDinoTextConfig def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config - self.embeddings = GroundingDINOTextEmbeddings(config) - self.encoder = GroundingDINOTextEncoder(config) + self.embeddings = GroundingDinoTextEmbeddings(config) + self.encoder = GroundingDinoTextEncoder(config) - self.pooler = GroundingDINOTextPooler(config) if add_pooling_layer else None + self.pooler = GroundingDinoTextPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index ac3d44eaa758b4..20265e98c15c09 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -51,13 +51,13 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids -class GroundingDINOProcessor(ProcessorMixin): +class GroundingDinoProcessor(ProcessorMixin): r""" Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a single processor. - [`GroundingDINOProcessor`] offers all the functionalities of [`DeformableDetrImageProcessor`] and - [`AutoTokenizer`]. See the docstring of [`~GroundingDINOProcessor.__call__`] and [`~GroundingDINOProcessor.decode`] + [`GroundingDinoProcessor`] offers all the functionalities of [`DeformableDetrImageProcessor`] and + [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`] for more information. 
Args: @@ -68,7 +68,7 @@ class GroundingDINOProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "GroundingDINOImageProcessor" + image_processor_class = "GroundingDinoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 686f5d7d1a11d7..9a28b7b98548e9 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4043,21 +4043,21 @@ def __init__(self, *args, **kwargs): GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None -class GroundingDINOForObjectDetection(metaclass=DummyObject): +class GroundingDinoForObjectDetection(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GroundingDINOModel(metaclass=DummyObject): +class GroundingDinoModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GroundingDINOPreTrainedModel(metaclass=DummyObject): +class GroundingDinoPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 26c7a08f7b064c..352d88cf65ce44 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -247,7 +247,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class GroundingDINOImageProcessor(metaclass=DummyObject): +class GroundingDinoImageProcessor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 3c24b9cedd4340..51bd5807991458 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -30,11 +30,11 @@ if is_vision_available(): from PIL import Image - from transformers import GroundingDINOImageProcessor + from transformers import GroundingDinoImageProcessor -# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester with DeformableDetr->GroundingDINO -class GroundingDINOImageProcessingTester(unittest.TestCase): +# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester with DeformableDetr->GroundingDino +class GroundingDinoImageProcessingTester(unittest.TestCase): def __init__( self, parent, @@ -81,7 +81,7 @@ def prepare_image_processor_dict(self): def get_expected_values(self, image_inputs, batched=False): """ - This function computes the expected height and width when providing images to GroundingDINOImageProcessor, + This function computes the expected height and width when providing images to GroundingDinoImageProcessor, assuming do_resize is set to True with a scalar size. 
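Concretely, the shorter edge of the input image is scaled to `size["shortest_edge"]` and the other
edge is scaled by the same factor, so the expected output preserves the aspect ratio (the image
processor itself may additionally cap the longer edge at `size["longest_edge"]`).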
""" if not batched: @@ -128,12 +128,12 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_torch @require_vision -# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDINO -class GroundingDINOImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = GroundingDINOImageProcessor if is_vision_available() else None +# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest with DeformableDetr->GroundingDino +class GroundingDinoImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None def setUp(self): - self.image_processor_tester = GroundingDINOImageProcessingTester(self) + self.image_processor_tester = GroundingDinoImageProcessingTester(self) @property def image_processor_dict(self): @@ -170,7 +170,7 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} # encode them - image_processing = GroundingDINOImageProcessor() + image_processing = GroundingDinoImageProcessor() encoding = image_processing(images=image, annotations=target, return_tensors="pt") # verify pixel values @@ -216,7 +216,7 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") # encode them - image_processing = GroundingDINOImageProcessor(format="coco_panoptic") + image_processing = GroundingDinoImageProcessor(format="coco_panoptic") encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") # verify pixel values diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index f8fc49fd3754ea..fc41dfb3a2349c 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -22,8 +22,8 @@ from typing import Dict, List, Tuple from transformers import ( - GroundingDINOConfig, - GroundingDINOTextConfig, + GroundingDinoConfig, + GroundingDinoTextConfig, SwinConfig, is_torch_available, is_vision_available, @@ -47,7 +47,7 @@ if is_torch_available(): import torch - from transformers import GroundingDINOForObjectDetection, GroundingDINOModel + from transformers import GroundingDinoForObjectDetection, GroundingDinoModel from transformers.pytorch_utils import id_tensor_storage @@ -57,7 +57,7 @@ from transformers import AutoProcessor -class GroundingDINOModelTester: +class GroundingDinoModelTester: def __init__( self, parent, @@ -146,10 +146,10 @@ def get_config(self): out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) - text_backbone = GroundingDINOTextConfig( + text_backbone = GroundingDinoTextConfig( hidden_size=8, num_hidden_layers=2, num_attention_heads=2, intermediate_size=8, max_position_embeddings=8 ) - return GroundingDINOConfig( + return GroundingDinoConfig( d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, @@ -176,7 +176,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, labels): - model = GroundingDINOModel(config=config) + model = GroundingDinoModel(config=config) model.to(torch_device) 
model.eval() @@ -185,7 +185,7 @@ def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, la self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) def create_and_check_object_detection_head_model(self, config, pixel_values, pixel_mask, input_ids, labels): - model = GroundingDINOForObjectDetection(config=config) + model = GroundingDinoForObjectDetection(config=config) model.to(torch_device) model.eval() @@ -202,15 +202,15 @@ def create_and_check_object_detection_head_model(self, config, pixel_values, pix @require_torch -class GroundingDINOModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (GroundingDINOModel, GroundingDINOForObjectDetection) if is_torch_available() else () +class GroundingDinoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GroundingDinoModel, GroundingDinoForObjectDetection) if is_torch_available() else () is_encoder_decoder = True test_torchscript = False test_pruning = False test_head_masking = False test_missing_keys = False pipeline_model_mapping = ( - {"feature-extraction": GroundingDINOModel, "object-detection": GroundingDINOForObjectDetection} + {"feature-extraction": GroundingDinoModel, "object-detection": GroundingDinoForObjectDetection} if is_torch_available() else {} ) @@ -220,7 +220,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class.__name__ == "GroundingDINOForObjectDetection": + if model_class.__name__ == "GroundingDinoForObjectDetection": labels = [] for i in range(self.model_tester.batch_size): target = {} @@ -243,8 +243,8 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict def setUp(self): - self.model_tester = GroundingDINOModelTester(self) - self.config_tester = ConfigTester(self, config_class=GroundingDINOConfig, has_text_modality=False) + self.model_tester = GroundingDinoModelTester(self) + self.config_tester = ConfigTester(self, config_class=GroundingDinoConfig, has_text_modality=False) def test_config(self): # we don't test common_properties and arguments_init as these don't apply for Grounding DINO @@ -325,7 +325,7 @@ def test_attention_outputs(self): if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "GroundingDINOForObjectDetection": + if model_class.__name__ == "GroundingDinoForObjectDetection": correct_outlen += 2 self.assertEqual(out_len, correct_outlen) @@ -580,7 +580,7 @@ def test_different_timm_backbone(self): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - if model_class.__name__ == "GroundingDINOForObjectDetection": + if model_class.__name__ == "GroundingDinoForObjectDetection": expected_shape = ( self.model_tester.batch_size, self.model_tester.num_queries, @@ -617,7 +617,7 @@ def test_initialization(self): ) def test_two_stage_training(self): - model_class = GroundingDINOForObjectDetection + model_class = GroundingDinoForObjectDetection config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True config.two_stage = True @@ -655,9 +655,9 @@ def test_tied_weights_keys(self): for i in range(len(tied_params)): tied_params[i] = [p for 
p in tied_params[i] if re.search(key, p) is None] - # GroundingDINO when sharing weights also uses the shared ones in GroundingDINODecoder + # GroundingDino when sharing weights also uses the shared ones in GroundingDinoDecoder # Therefore, differently from DeformableDetr, we expect the group lens to be 2 - # one for self.bbox_embed in GroundingDINOForObejectDetection and another one + # one for self.bbox_embed in GroundingDinoForObejectDetection and another one # in the decoder tied_params = [group for group in tied_params if len(group) > 2] self.assertListEqual( @@ -684,13 +684,13 @@ def prepare_text(): @require_timm @require_vision @slow -class GroundingDINOModelIntegrationTests(unittest.TestCase): +class GroundingDinoModelIntegrationTests(unittest.TestCase): @cached_property def default_processor(self): return AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") if is_vision_available() else None def test_inference_object_detection_head(self): - model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").to(torch_device) + model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny").to(torch_device) processor = self.default_processor image = prepare_img() @@ -749,7 +749,7 @@ def test_inference_object_detection_head_equivalence_cpu_gpu(self): encoding = processor(images=image, text=text, return_tensors="pt") # 1. run model on CPU - model = GroundingDINOForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") with torch.no_grad(): cpu_outputs = model(**encoding) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 13a133c80cc861..44283bc69737e6 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -21,7 +21,7 @@ import numpy as np import pytest -from transformers import BertTokenizer, BertTokenizerFast, GroundingDINOProcessor +from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available @@ -30,11 +30,11 @@ if is_vision_available(): from PIL import Image - from transformers import GroundingDINOImageProcessor + from transformers import GroundingDinoImageProcessor @require_vision -class GroundingDINOProcessorTest(unittest.TestCase): +class GroundingDinoProcessorTest(unittest.TestCase): def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -65,9 +65,9 @@ def get_tokenizer(self, **kwargs): def get_rust_tokenizer(self, **kwargs): return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_image_processor with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_image_processor with CLIP->GroundingDino def get_image_processor(self, **kwargs): - return GroundingDINOImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + return GroundingDinoImageProcessor.from_pretrained(self.tmpdirname, **kwargs) # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.tearDown def tearDown(self): @@ -85,19 +85,19 @@ def prepare_image_inputs(self): return image_inputs - # Copied from 
tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_default with CLIP->GroundingDINO,GroundingDINOTokenizer->BertTokenizer + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_default with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() - processor_slow = GroundingDINOProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow = GroundingDinoProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) processor_slow.save_pretrained(self.tmpdirname) - processor_slow = GroundingDINOProcessor.from_pretrained(self.tmpdirname, use_fast=False) + processor_slow = GroundingDinoProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = GroundingDINOProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = GroundingDinoProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) processor_fast.save_pretrained(self.tmpdirname) - processor_fast = GroundingDINOProcessor.from_pretrained(self.tmpdirname) + processor_fast = GroundingDinoProcessor.from_pretrained(self.tmpdirname) self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) @@ -107,18 +107,18 @@ def test_save_load_pretrained_default(self): self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, GroundingDINOImageProcessor) - self.assertIsInstance(processor_fast.image_processor, GroundingDINOImageProcessor) + self.assertIsInstance(processor_slow.image_processor, GroundingDinoImageProcessor) + self.assertIsInstance(processor_fast.image_processor, GroundingDinoImageProcessor) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDINO,GroundingDINOTokenizer->BertTokenizer + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer def test_save_load_pretrained_additional_features(self): - processor = GroundingDINOProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor = GroundingDinoProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - processor = GroundingDINOProcessor.from_pretrained( + processor = GroundingDinoProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 ) @@ -126,14 +126,14 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, GroundingDINOImageProcessor) + self.assertIsInstance(processor.image_processor, 
GroundingDinoImageProcessor) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDino def test_image_processor(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) image_input = self.prepare_image_inputs() @@ -143,12 +143,12 @@ def test_image_processor(self): for key in input_image_proc.keys(): self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDino def test_tokenizer(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) input_str = "lower newer" @@ -163,7 +163,7 @@ def test_processor(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -178,12 +178,12 @@ def test_processor(self): with pytest.raises(ValueError): processor() - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer_decode with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer_decode with CLIP->GroundingDino def test_tokenizer_decode(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] @@ -192,12 +192,12 @@ def test_tokenizer_decode(self): self.assertListEqual(decoded_tok, decoded_processor) - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_model_input_names with CLIP->GroundingDINO + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_model_input_names with CLIP->GroundingDino def test_model_input_names(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() - processor = GroundingDINOProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) input_str = "lower newer" image_input = self.prepare_image_inputs() diff --git a/utils/check_repo.py b/utils/check_repo.py index 5ecf0aa9a7bf07..798d89e8ca6890 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -232,7 +232,7 @@ "FlavaMultimodalModel", "GPT2DoubleHeadsModel", "GPTSw3DoubleHeadsModel", - "GroundingDINOTextPrenet", + "GroundingDinoTextPrenet", "InstructBlipVisionModel", "InstructBlipQFormerModel", "LayoutLMForQuestionAnswering", From 559de31855fc0b78d1cc8d5b8a47cd5d65349b16 Mon Sep 17 00:00:00 2001 From: 
EduardoPach Date: Fri, 8 Dec 2023 14:35:57 -0300 Subject: [PATCH 155/252] Fixed GroundingDinoProcessor docstrings --- .../models/grounding_dino/processing_grounding_dino.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 20265e98c15c09..0e658a42f77baa 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -56,13 +56,13 @@ class GroundingDinoProcessor(ProcessorMixin): Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a single processor. - [`GroundingDinoProcessor`] offers all the functionalities of [`DeformableDetrImageProcessor`] and + [`GroundingDinoProcessor`] offers all the functionalities of [`GroundingDinoImageProcessor`] and [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`] for more information. Args: - image_processor (`DeformableDetrImageProcessor`): - An instance of [`DeformableDetrImageProcessor`]. The image processor is a required input. + image_processor (`GroundingDinoImageProcessor`): + An instance of [`GroundingDinoImageProcessor`]. The image processor is a required input. tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. """ @@ -95,7 +95,7 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method uses [`DeformableDetrImageProcessor.__call__`] method to prepare image(s) for the model, and + This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. Please refer to the docstring of the above two methods for more information. From fef983e21c1a28bdecbf3000cf2fe222f32152c4 Mon Sep 17 00:00:00 2001 From: EduardoPach Date: Fri, 8 Dec 2023 14:47:03 -0300 Subject: [PATCH 156/252] loading MSDA kernels in the modeling file --- .../models/grounding_dino/load_custom.py | 49 ------------------- .../grounding_dino/modeling_grounding_dino.py | 37 +++++++++++++- 2 files changed, 35 insertions(+), 51 deletions(-) delete mode 100644 src/transformers/models/grounding_dino/load_custom.py diff --git a/src/transformers/models/grounding_dino/load_custom.py b/src/transformers/models/grounding_dino/load_custom.py deleted file mode 100644 index 97b8f09fb5f446..00000000000000 --- a/src/transformers/models/grounding_dino/load_custom.py +++ /dev/null @@ -1,49 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Loading of Grounding DINO's CUDA kernels""" -import os -from pathlib import Path - - -def load_cuda_kernels(): - from torch.utils.cpp_extension import load - - root = Path(__file__).resolve().parent.parent.parent / "kernels" / "grounding_dino" - src_files = [ - root / filename - for filename in [ - "vision.cpp", - os.path.join("cpu", "ms_deform_attn_cpu.cpp"), - os.path.join("cuda", "ms_deform_attn_cuda.cu"), - ] - ] - - load( - "MultiScaleDeformableAttention", - src_files, - with_cuda=True, - extra_include_paths=[str(root)], - extra_cflags=["-DWITH_CUDA=1"], - extra_cuda_cflags=[ - "-DCUDA_HAS_FP16=1", - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - ], - ) - - import MultiScaleDeformableAttention as MSDA - - return MSDA diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 664f549603b6e3..0e2e6a4baf23cd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -14,11 +14,12 @@ # limitations under the License. """ PyTorch Grounding DINO model.""" - import copy import math +import os import warnings from dataclasses import dataclass +from pathlib import Path from typing import Dict, List, Optional, Tuple, Union import torch @@ -47,11 +48,43 @@ from ...utils import is_ninja_available, logging from ..auto import AutoBackbone from .configuration_grounding_dino import GroundingDinoConfig, GroundingDinoTextConfig -from .load_custom import load_cuda_kernels logger = logging.get_logger(__name__) + +def load_cuda_kernels(): + from torch.utils.cpp_extension import load + + root = Path(__file__).resolve().parent.parent.parent / "kernels" / "grounding_dino" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] + + load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) + + import MultiScaleDeformableAttention as MSDA + + return MSDA + + # Move this to not compile only when importing, this needs to happen later, like in __init__. 
if is_torch_cuda_available() and is_ninja_available(): logger.info("Loading custom CUDA kernels...") From 9994ee0162eeb3a38bb520dfca03991e98a5a62a Mon Sep 17 00:00:00 2001 From: Niels Date: Mon, 11 Dec 2023 11:09:24 +0100 Subject: [PATCH 157/252] Fix copies --- .../models/grounding_dino/image_processing_grounding_dino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index d98892922c0024..4565b744b0a774 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -331,10 +331,13 @@ def prepare_coco_detection_annotation( if annotations and "keypoints" in annotations[0]: keypoints = [obj["keypoints"] for obj in annotations] + # Converting the filtered keypoints list to a numpy array keypoints = np.asarray(keypoints, dtype=np.float32) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] num_keypoints = keypoints.shape[0] keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints - new_target["keypoints"] = keypoints[keep] + new_target["keypoints"] = keypoints if return_segmentation_masks: segmentation_masks = [obj["segmentation"] for obj in annotations] From 14c839dfc8de22b02e29218a4e616a8403af09b6 Mon Sep 17 00:00:00 2001 From: Niels Date: Wed, 31 Jan 2024 22:35:36 +0100 Subject: [PATCH 158/252] Replace nn.multiheadattention --- .../convert_grounding_dino_to_hf.py | 39 ++++++- .../grounding_dino/modeling_grounding_dino.py | 110 ++++++++++++++++-- 2 files changed, 138 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 066e0a209a0f53..5b7290bdfd3184 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -280,7 +280,7 @@ def rename_key(dct, old, new): # we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): +def read_in_q_k_v_encoder(state_dict, config): ########################################## VISION BACKBONE - START embed_dim = config.backbone_config.embed_dim for layer, depth in enumerate(config.backbone_config.depths): @@ -313,6 +313,25 @@ def read_in_q_k_v(state_dict, config): ########################################## VISION BACKBONE - END +def read_in_q_k_v_decoder(state_dict, config): + hidden_size = config.hidden_size + for idx in range(config.decoder_layers): + # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.decoder.layers.{idx}.self_attn.query.weight"] = in_proj_weight[:hidden_size, :] + state_dict[f"model.decoder.layers.{idx}.self_attn.query.bias"] = in_proj_bias[:hidden_size] + + state_dict[f"model.decoder.layers.{idx}.self_attn.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"model.decoder.layers.{idx}.self_attn.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] + + 
state_dict[f"model.decoder.layers.{idx}.self_attn.value.weight"] = in_proj_weight[-hidden_size:, :] + state_dict[f"model.decoder.layers.{idx}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] + + # We will verify our results on an image of cute cats def prepare_img(): url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -345,18 +364,24 @@ def convert_grounding_dino_checkpoint(args): original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} + for name, param in original_state_dict.items(): + print(name, param.shape) + # Rename keys new_state_dict = original_state_dict.copy() rename_keys = create_rename_keys(original_state_dict, config) for src, dest in rename_keys: rename_key(new_state_dict, src, dest) - read_in_q_k_v(new_state_dict, config) + read_in_q_k_v_encoder(new_state_dict, config) + read_in_q_k_v_decoder(new_state_dict, config) # Load HF model model = GroundingDinoForObjectDetection(config) model.eval() missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) # Load and process test image image = prepare_img() @@ -374,7 +399,15 @@ def convert_grounding_dino_checkpoint(args): # Running forward with torch.no_grad(): - _ = model(**inputs) + outputs = model(**inputs) + + print(outputs.logits[0, :3, :3]) + + expected_slice = torch.tensor( + [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] + ) + + assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) if pytorch_dump_folder_path is not None: model.save_pretrained(pytorch_dump_folder_path) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 0e2e6a4baf23cd..56a7f9c55114dc 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1273,18 +1273,104 @@ def forward( ) +class GroundingDinoMultiheadAttention(nn.Module): + """Equivalent implementation of nn.MultiheadAttention with batch_first=True.""" + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) + + self.dropout = nn.Dropout(config.attention_dropout) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + queries: torch.Tensor, + keys: torch.Tensor, + values: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] 
= None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(queries) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + else: + key_layer = self.transpose_for_scores(self.key(keys)) + value_layer = self.transpose_for_scores(self.value(values)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDinoModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + context_layer = self.out_proj(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + class GroundingDinoDecoderLayer(nn.Module): def __init__(self, config: GroundingDinoConfig): super().__init__() self.embed_dim = config.d_model # self-attention - self.self_attn = nn.MultiheadAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout, - batch_first=True, - ) + mha_config = copy.deepcopy(config) + mha_config.num_attention_heads = config.decoder_attention_heads + self.self_attn = GroundingDinoMultiheadAttention(mha_config) + # self.self_attn = nn.MultiheadAttention( + # embed_dim=self.embed_dim, + # num_heads=config.decoder_attention_heads, + # dropout=config.attention_dropout, + # batch_first=True, + # ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout @@ -1330,10 +1416,18 @@ def forward( residual = hidden_states # Self Attention - q = k = self.with_pos_embed(hidden_states, position_embeddings) + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, self_attn_weights = self.self_attn( - query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=self_attn_mask, + output_attentions=True, ) + # q = k = self.with_pos_embed(hidden_states, position_embeddings) + # hidden_states, self_attn_weights = self.self_attn( + # query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, 
average_attn_weights=False + # ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states From 5a6f2583f9eee532b51c475bfd160eb001fea3cc Mon Sep 17 00:00:00 2001 From: Niels Date: Thu, 1 Feb 2024 09:09:38 +0100 Subject: [PATCH 159/252] Replace nn.multiheadattention --- .../convert_grounding_dino_to_hf.py | 40 +++++++++++--- .../grounding_dino/modeling_grounding_dino.py | 52 +++++-------------- 2 files changed, 44 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 5b7290bdfd3184..3d9b7673fbef38 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -331,6 +331,24 @@ def read_in_q_k_v_decoder(state_dict, config): state_dict[f"model.decoder.layers.{idx}.self_attn.value.weight"] = in_proj_weight[-hidden_size:, :] state_dict[f"model.decoder.layers.{idx}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] + # read in weights + bias of cross-attention + in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_weight") + in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_bias") + + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.weight"] = in_proj_weight[:hidden_size, :] + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.bias"] = in_proj_bias[:hidden_size] + + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.bias"] = in_proj_bias[ + hidden_size : hidden_size * 2 + ] + + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.weight"] = in_proj_weight[-hidden_size:, :] + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.bias"] = in_proj_bias[-hidden_size:] + # We will verify our results on an image of cute cats def prepare_img(): @@ -351,6 +369,7 @@ def convert_grounding_dino_checkpoint(args): model_name = args.model_name pytorch_dump_folder_path = args.pytorch_dump_folder_path push_to_hub = args.push_to_hub + verify_logits = args.verify_logits checkpoint_mapping = { "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth", @@ -397,17 +416,19 @@ def convert_grounding_dino_checkpoint(args): assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) - # Running forward - with torch.no_grad(): - outputs = model(**inputs) + if verify_logits: + # Running forward + with torch.no_grad(): + outputs = model(**inputs) - print(outputs.logits[0, :3, :3]) + print(outputs.logits[0, :3, :3]) - expected_slice = torch.tensor( - [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] - ) + expected_slice = torch.tensor( + [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] + ) - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) + assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) + print("Looks ok!") if pytorch_dump_folder_path is not None: model.save_pretrained(pytorch_dump_folder_path) @@ -434,6 +455,9 @@ def convert_grounding_dino_checkpoint(args): parser.add_argument( 
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." ) + parser.add_argument( + "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." + ) args = parser.parse_args() convert_grounding_dino_checkpoint(args) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 56a7f9c55114dc..f99ec9ab2c9717 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1274,7 +1274,7 @@ def forward( class GroundingDinoMultiheadAttention(nn.Module): - """Equivalent implementation of nn.MultiheadAttention with batch_first=True.""" + """Equivalent implementation of nn.MultiheadAttention with `batch_first=True`.""" def __init__(self, config): super().__init__() @@ -1307,26 +1307,11 @@ def forward( keys: torch.Tensor, values: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - mixed_query_layer = self.query(queries) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - else: - key_layer = self.transpose_for_scores(self.key(keys)) - value_layer = self.transpose_for_scores(self.value(values)) - - query_layer = self.transpose_for_scores(mixed_query_layer) + query_layer = self.transpose_for_scores(self.query(queries)) + key_layer = self.transpose_for_scores(self.key(keys)) + value_layer = self.transpose_for_scores(self.value(values)) # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) @@ -1365,24 +1350,14 @@ def __init__(self, config: GroundingDinoConfig): mha_config = copy.deepcopy(config) mha_config.num_attention_heads = config.decoder_attention_heads self.self_attn = GroundingDinoMultiheadAttention(mha_config) - # self.self_attn = nn.MultiheadAttention( - # embed_dim=self.embed_dim, - # num_heads=config.decoder_attention_heads, - # dropout=config.attention_dropout, - # batch_first=True, - # ) + self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention text - self.encoder_attn_text = nn.MultiheadAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout, - batch_first=True, - ) + self.encoder_attn_text = GroundingDinoMultiheadAttention(mha_config) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention self.encoder_attn = GroundingDinoMultiscaleDeformableAttention( @@ -1424,10 +1399,6 @@ def forward( attention_mask=self_attn_mask, output_attentions=True, ) - # q = k = self.with_pos_embed(hidden_states, position_embeddings) - # hidden_states, self_attn_weights = self.self_attn( - # query=q, key=k, value=hidden_states, attn_mask=self_attn_mask, average_attn_weights=False - # ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states @@ -1436,12 +1407,13 @@ def forward( second_residual = hidden_states # Cross-Attention Text + queries = self.with_pos_embed(hidden_states, position_embeddings) hidden_states, text_cross_attn_weights = self.encoder_attn_text( - query=self.with_pos_embed(hidden_states, position_embeddings), - key=text_encoder_hidden_states, - value=text_encoder_hidden_states, - key_padding_mask=text_encoder_attention_mask, - average_attn_weights=False, + queries=queries, + keys=text_encoder_hidden_states, + values=text_encoder_hidden_states, + attention_mask=text_encoder_attention_mask, + output_attentions=True, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) From 9fa83da7f00c2769d9111a8c9f3062810e4a43d7 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 01:17:41 +0100 Subject: [PATCH 160/252] Fixed inputs for GroundingDinoMultiheadAttention & order of modules --- .../grounding_dino/modeling_grounding_dino.py | 3096 ++++++++--------- 1 file changed, 1544 insertions(+), 1552 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 56a7f9c55114dc..7edaa5dbbfb827 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -761,16 +761,97 @@ def forward( return output, attention_weights +class GroundingDinoMultiheadAttention(nn.Module): + """Equivalent implementation of nn.MultiheadAttention with batch_first=True.""" + + def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0): + super().__init__() + if embed_dim % num_heads != 0: + raise ValueError( + f"The hidden size ({embed_dim}) is not a multiple of the number of attention " f"heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(embed_dim / num_heads) + self.all_head_size = self.num_attention_heads * 
self.attention_head_size + + self.query = nn.Linear(embed_dim, self.all_head_size) + self.key = nn.Linear(embed_dim, self.all_head_size) + self.value = nn.Linear(embed_dim, self.all_head_size) + + self.out_proj = nn.Linear(embed_dim, embed_dim) + + self.dropout = nn.Dropout(dropout) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + queries: torch.Tensor, + keys: torch.Tensor, + values: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(queries) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + else: + key_layer = self.transpose_for_scores(self.key(keys)) + value_layer = self.transpose_for_scores(self.value(values)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDinoModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + context_layer = self.out_proj(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + class GroundingDinoTextEnhancerLayer(nn.Module): """Vanilla Transformer with text embeddings as input""" def __init__(self, config): super().__init__() - self.self_attn = nn.MultiheadAttention( + self.self_attn = GroundingDinoMultiheadAttention( embed_dim=config.d_model, num_heads=config.encoder_attention_heads // 2, dropout=config.text_enhancer_dropout, - batch_first=True, ) # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) @@ -1085,7 +1166,6 @@ def forward( return (vision_features, vision_attn), (text_features, text_attn) -# NOTE just renamed the class class GroundingDinoDeformableLayer(nn.Module): def __init__(self, config: GroundingDinoConfig): super().__init__() @@ -1273,115 +1353,27 @@ def forward( ) -class GroundingDinoMultiheadAttention(nn.Module): - """Equivalent implementation of nn.MultiheadAttention with batch_first=True.""" - - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})" - ) - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) - - self.dropout = nn.Dropout(config.attention_dropout) - - def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - queries: torch.Tensor, - keys: torch.Tensor, - values: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - mixed_query_layer = self.query(queries) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
- is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - else: - key_layer = self.transpose_for_scores(self.key(keys)) - value_layer = self.transpose_for_scores(self.value(values)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in GroundingDinoModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) - - context_layer = self.out_proj(context_layer) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - return outputs - - class GroundingDinoDecoderLayer(nn.Module): def __init__(self, config: GroundingDinoConfig): super().__init__() self.embed_dim = config.d_model # self-attention - mha_config = copy.deepcopy(config) - mha_config.num_attention_heads = config.decoder_attention_heads - self.self_attn = GroundingDinoMultiheadAttention(mha_config) - # self.self_attn = nn.MultiheadAttention( - # embed_dim=self.embed_dim, - # num_heads=config.decoder_attention_heads, - # dropout=config.attention_dropout, - # batch_first=True, - # ) + self.self_attn = GroundingDinoMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention text - self.encoder_attn_text = nn.MultiheadAttention( + self.encoder_attn_text = GroundingDinoMultiheadAttention( embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - batch_first=True, ) self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim) # cross-attention @@ -2077,1732 +2069,1732 @@ def custom_forward(*inputs): ) -SPECIAL_TOKENS = [101, 102, 1012, 1029] +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDinoText +class GroundingDinoTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" -def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: - """Generate attention mask between each pair of special tokens and positional ids. - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
- Returns: - `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids: - - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`) - - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`) - """ - batch_size, num_token = input_ids.shape - # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens - special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool() - for special_token in SPECIAL_TOKENS: - special_tokens_mask |= input_ids == special_token + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - # idxs: each row is a list of indices of special tokens - idxs = torch.nonzero(special_tokens_mask) + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) - # generate attention mask and positional ids - attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1) - position_ids = torch.zeros((batch_size, num_token), device=input_ids.device) - previous_col = 0 - for i in range(idxs.shape[0]): - row, col = idxs[i] - if (col == 0) or (col == num_token - 1): - attention_mask[row, col, col] = True - position_ids[row, col] = 0 + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() else: - attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True - position_ids[row, previous_col + 1 : col + 1] = torch.arange( - 0, col - previous_col, device=input_ids.device - ) - - previous_col = col + input_shape = inputs_embeds.size()[:-1] - return attention_mask, position_ids.to(torch.long) + seq_length = input_shape[1] + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] -@add_start_docstrings( - """ - The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw - hidden-states without any specific head on top. 
- """, - GROUNDING_DINO_START_DOCSTRING, -) -class GroundingDinoModel(GroundingDinoPreTrainedModel): - def __init__(self, config: GroundingDinoConfig): - super().__init__(config) + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) - # Create backbone + positional encoding - backbone = GroundingDinoConvEncoder(config) - position_embeddings = build_position_encoding(config) - self.backbone = GroundingDinoConvModel(backbone, position_embeddings) + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) - # Create input projection layers - if config.num_feature_levels > 1: - num_backbone_outs = len(backbone.intermediate_channel_sizes) - input_proj_list = [] - for _ in range(num_backbone_outs): - in_channels = backbone.intermediate_channel_sizes[_] - input_proj_list.append( - nn.Sequential( - nn.Conv2d(in_channels, config.d_model, kernel_size=1), - nn.GroupNorm(32, config.d_model), - ) - ) - for _ in range(config.num_feature_levels - num_backbone_outs): - input_proj_list.append( - nn.Sequential( - nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), - nn.GroupNorm(32, config.d_model), - ) - ) - in_channels = config.d_model - self.input_proj_vision = nn.ModuleList(input_proj_list) - else: - self.input_proj_vision = nn.ModuleList( - [ - nn.Sequential( - nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), - nn.GroupNorm(32, config.d_model), - ) - ] + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDinoText +class GroundingDinoTextSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" ) - # Create text backbone - self.text_backbone = GroundingDinoTextPrenet(config.text_backbone_config) - self.text_projection = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size - if config.embedding_init_target or not config.two_stage: - self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, 
self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) - self.encoder = GroundingDinoEncoder(config) - self.decoder = GroundingDinoDecoder(config) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + self.is_decoder = config.is_decoder - if config.two_stage: - self.enc_output = nn.Linear(config.d_model, config.d_model) - self.enc_output_norm = nn.LayerNorm(config.d_model) - if ( - config.two_stage_bbox_embed_share - and config.decoder_bbox_embed_share - and self.decoder.bbox_embed is not None - ): - self.encoder_output_bbox_embed = self.decoder.bbox_embed - else: - self.encoder_output_bbox_embed = GroundingDinoMLPPredictionHead( - input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 - ) + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) - self.encoder_output_class_embed = GroundingDinoContrastiveEmbedding(config) - else: - self.reference_points = nn.Embedding(config.num_queries, 4) + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) - self.post_init() + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None - def get_encoder(self): - return self.encoder + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) - def get_decoder(self): - return self.decoder + query_layer = self.transpose_for_scores(mixed_query_layer) - def freeze_backbone(self): - for name, param in self.backbone.conv_encoder.model.named_parameters(): - param.requires_grad_(False) + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) - def unfreeze_backbone(self): - for name, param in self.backbone.conv_encoder.model.named_parameters(): - param.requires_grad_(True) + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - def get_valid_ratio(self, mask): - """Get the valid ratio of all feature maps.""" + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r - _, height, width = mask.shape - valid_height = torch.sum(mask[:, :, 0], 1) - valid_width = torch.sum(mask[:, 0, :], 1) - valid_ratio_heigth = valid_height.float() / height - valid_ratio_width = valid_width.float() / width - valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1) - return valid_ratio + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - def get_proposal_pos_embed(self, proposals): - """Get the position embedding of the proposals.""" + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - num_pos_feats = self.config.d_model // 2 - temperature = 10000 - scale = 2 * math.pi + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDinoTextModel forward() function) + attention_scores = attention_scores + attention_mask - dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) - dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) - # batch_size, num_queries, 4 - proposals = proposals.sigmoid() * scale - # batch_size, num_queries, 4, 128 - pos = proposals[:, :, :, None] / dim_t - # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 - pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) - return pos + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) - def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): - """Generate the encoder output proposals from encoded enc_output. + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) - Args: - enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder. - padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`. - spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps. 
+ # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask - Returns: - `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. - - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to - directly predict a bounding box. (without the need of a decoder) - - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse - sigmoid. - """ - batch_size = enc_output.shape[0] - proposals = [] - current_position = 0 - for level, (height, width) in enumerate(spatial_shapes): - mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view( - batch_size, height, width, 1 - ) - valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) - valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + context_layer = torch.matmul(attention_probs, value_layer) - grid_y, grid_x = meshgrid( - torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), - torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), - indexing="ij", - ) - grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) - scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) - grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale - width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) - proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) - proposals.append(proposal) - current_position += height * width + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - output_proposals = torch.cat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) - output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid - output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) - output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs - # assign each pixel as an object query - object_query = enc_output - object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) - object_query = object_query.masked_fill(~output_proposals_valid, float(0)) - object_query = self.enc_output_norm(self.enc_output(object_query)) - return object_query, output_proposals - @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GroundingDinoModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - pixel_values: Tensor, - input_ids: Tensor, - token_type_ids: Tensor = None, - attention_mask: Tensor = None, - pixel_mask: Optional[Tensor] = None, - encoder_outputs=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Returns: +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDinoText +class GroundingDinoTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = 
nn.Dropout(config.hidden_dropout_prob) - Examples: + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states - ```python - >>> from transformers import AutoProcessor, GroundingDinoModel - >>> from PIL import Image - >>> import requests - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> text = "a cat." +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDinoText +class GroundingDinoTextAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = GroundingDinoTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = GroundingDinoTextSelfOutput(config) + self.pruned_heads = set() - >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") - >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) - >>> inputs = processor(images=image, text=text, return_tensors="pt") - >>> outputs = model(**inputs) + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - >>> last_hidden_states = outputs.last_hidden_state - >>> list(last_hidden_states.shape) - [1, 900, 256] - ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs - text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDinoText +class 
GroundingDinoTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states - text_token_mask = attention_mask.bool() # just to avoid renaming everywhere - max_text_len = self.config.max_text_len - if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] - position_ids = position_ids[:, :max_text_len] - input_ids = input_ids[:, :max_text_len] - token_type_ids = token_type_ids[:, :max_text_len] - text_token_mask = text_token_mask[:, :max_text_len] +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDinoText +class GroundingDinoTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) - # Extract text features from text backbone - text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ - "last_hidden_state" - ] - text_features = self.text_projection(text_features) + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states - batch_size, num_channels, height, width = pixel_values.shape - device = pixel_values.device - if pixel_mask is None: - pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDinoText +class GroundingDinoTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = GroundingDinoTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = GroundingDinoTextAttention(config, position_embedding_type="absolute") + self.intermediate = GroundingDinoTextIntermediate(config) + self.output = GroundingDinoTextOutput(config) - # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) - # First, sent pixel_values + pixel_mask through Backbone to obtain the features - # which is a list of tuples - vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] 
= None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] - # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) - sources = [] - masks = [] - for level, (source, mask) in enumerate(vision_features): - sources.append(self.input_proj_vision[level](source)) - masks.append(mask) - if mask is None: - raise ValueError("No attention mask was provided") + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage - if self.config.num_feature_levels > len(sources): - _len_sources = len(sources) - for level in range(_len_sources, self.config.num_feature_levels): - if level == _len_sources: - source = self.input_proj_vision[level](vision_features[-1][0]) - else: - source = self.input_proj_vision[level](sources[-1]) - mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] - pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) - sources.append(source) - masks.append(mask) - position_embeddings_list.append(pos_l) + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) - # Create queries - query_embeds = None - if self.config.embedding_init_target or self.config.two_stage: - query_embeds = self.query_position_embeddings.weight + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights - # Prepare encoder inputs (by flattening) - source_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): - batch_size, num_channels, height, width = source.shape - spatial_shape = (height, width) - spatial_shapes.append(spatial_shape) - source = source.flatten(2).transpose(1, 2) - mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose(1, 2) - lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) - lvl_pos_embed_flatten.append(lvl_pos_embed) - source_flatten.append(source) - mask_flatten.append(mask) - source_flatten = torch.cat(source_flatten, 1) - mask_flatten = torch.cat(mask_flatten, 1) - lvl_pos_embed_flatten = 
torch.cat(lvl_pos_embed_flatten, 1) - spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) - valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) - valid_ratios = valid_ratios.float() + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value - # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder - # Also provide spatial_shapes, level_start_index and valid_ratios - if encoder_outputs is None: - encoder_outputs = self.encoder( - vision_features=source_flatten, - vision_attention_mask=~mask_flatten, - vision_position_embedding=lvl_pos_embed_flatten, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - text_features=text_features, - text_attention_mask=~text_token_mask, - text_position_embedding=None, - text_self_attention_masks=text_self_attention_masks, - text_position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDinoEncoderOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, GroundingDinoEncoderOutput): - encoder_outputs = GroundingDinoEncoderOutput( - last_hidden_state_vision=encoder_outputs[0], - last_hidden_state_text=encoder_outputs[1], - hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, - attentions=encoder_outputs[4] if len(encoder_outputs) > 4 else None, - ) + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs - # Fifth, prepare decoder inputs - enc_outputs_class = None - enc_outputs_coord_logits = None - if self.config.two_stage: - object_query_embedding, output_proposals = self.gen_encoder_output_proposals( - encoder_outputs[0], ~mask_flatten, spatial_shapes - ) + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) - # hack implementation for two-stage Deformable DETR - # apply a detection head to each pixel (A.4 in paper) - # linear projection for bounding box binary classification (i.e. 
foreground and background) - enc_outputs_class = self.encoder_output_class_embed( - object_query_embedding, encoder_outputs[1], text_token_mask - ) - # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) - delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) - enc_outputs_coord_logits = delta_bbox + output_proposals + return outputs - # only keep top scoring `config.num_queries` proposals - topk = self.config.num_queries - topk_logits = enc_outputs_class.max(-1)[0] - topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] - topk_coords_logits = torch.gather( - enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) - ) + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output - topk_coords_logits = topk_coords_logits.detach() - reference_points = topk_coords_logits.sigmoid() - init_reference_points = reference_points - if query_embeds is not None: - target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDinoText +class GroundingDinoTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([GroundingDinoTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) else: - target = torch.gather( - object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) - ).detach() - else: - target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) - reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() - init_reference_points = reference_points + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) - decoder_outputs = self.decoder( - inputs_embeds=target, - vision_encoder_hidden_states=encoder_outputs[0], - vision_encoder_attention_mask=mask_flatten, - text_encoder_hidden_states=encoder_outputs[1], - text_encoder_attention_mask=~text_token_mask, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - self_attn_mask=None, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) - tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) - return tuple_outputs - return GroundingDinoModelOutput( - init_reference_points=init_reference_points, - last_hidden_state=decoder_outputs.last_hidden_state, - intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, - intermediate_reference_points=decoder_outputs.intermediate_reference_points, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, - encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, - encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, - encoder_hidden_states_text=encoder_outputs.hidden_states_text, - encoder_attentions=encoder_outputs.attentions, - enc_outputs_class=enc_outputs_class, - enc_outputs_coord_logits=enc_outputs_coord_logits, - ) +# Copied from 
transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDinoText +class GroundingDinoTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output -@add_start_docstrings( - """ - Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, - for tasks such as COCO detection. - """, - GROUNDING_DINO_START_DOCSTRING, -) -class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel): - # When using clones, all layers > 0 will be clones, but layer 0 *is* required - _tied_weights_keys = [r"bbox_embed\.[1-9]\d*"] - def __init__(self, config: GroundingDinoConfig): - super().__init__(config) +class GroundingDinoTextPrenet(GroundingDinoPreTrainedModel): + config_class = GroundingDinoTextConfig - # Deformable DETR encoder-decoder model - self.model = GroundingDinoModel(config) + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config - # Detection heads on top - _class_embed = GroundingDinoContrastiveEmbedding(config) - _bbox_embed = GroundingDinoMLPPredictionHead( - input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 - ) + self.embeddings = GroundingDinoTextEmbeddings(config) + self.encoder = GroundingDinoTextEncoder(config) - if config.decoder_bbox_embed_share: - self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) - else: - self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) - self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) - # hack implementation for two-stage - self.model.decoder.bbox_embed = self.bbox_embed - self.model.decoder.class_embed = self.class_embed + self.pooler = GroundingDinoTextPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() - # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py - @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_coord): - # this is a workaround to make torchscript happy, as torchscript - # doesn't support dictionary with non-homogeneous values, such - # as a dict having both a Tensor and a list. - return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + def get_input_embeddings(self): + return self.embeddings.word_embeddings - @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GroundingDinoObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - pixel_values: torch.FloatTensor, - input_ids: torch.LongTensor, - attention_mask: torch.LongTensor = None, - token_type_ids: torch.LongTensor = None, - pixel_mask: Optional[torch.BoolTensor] = None, - encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, Tuple]] = None, - labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ): - r""" - labels (`List[Dict]` of len `(batch_size,)`, *optional*): - Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the - following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch - respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes - in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. - - Returns: - - Examples: + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - ```python - >>> from transformers import AutoProcessor, GroundingDinoForObjectDetection - >>> from PIL import Image - >>> import requests + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> text = "a cat." + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device - >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") - >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length)), device=device) - >>> inputs = processor(images=image, text=text, return_tensors="pt") - >>> outputs = model(**inputs) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - >>> # convert outputs (bounding boxes and class logits) to COCO API - >>> target_sizes = torch.tensor([image.size[::-1]]) - >>> results = processor.image_processor.post_process_object_detection( - ... outputs, threshold=0.35, target_sizes=target_sizes - ... 
)[0] - >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): - ... box = [round(i, 2) for i in box.tolist()] - ... print(f"Detected {label.item()} with confidence " f"{round(score.item(), 3)} at location {box}") - Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83] - Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89] - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs - outputs = self.model( - pixel_values=pixel_values, + embedding_output = self.embeddings( input_ids=input_ids, - attention_mask=attention_mask, + position_ids=position_ids, token_type_ids=token_type_ids, - pixel_mask=pixel_mask, - encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - # index for encoder_last_hidden_state_text - idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0) - - hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] - enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx] - init_reference = outputs.init_reference_points if return_dict else outputs[0] - inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] - - # class logits + predicted bounding boxes - outputs_classes = [] - outputs_coords = [] + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] - for level in range(hidden_states.shape[1]): - if level == 0: - reference = init_reference - else: - reference = inter_references[:, level - 1] - reference = inverse_sigmoid(reference) - outputs_class = self.class_embed[level]( - vision_hidden_state=hidden_states[:, level], - text_hidden_state=enc_text_hidden_state, - text_token_mask=attention_mask.bool(), - ) - delta_bbox = self.bbox_embed[level](hidden_states[:, level]) - if reference.shape[-1] == 4: - outputs_coord_logits = delta_bbox + reference - elif reference.shape[-1] == 2: - delta_bbox[..., :2] += reference - outputs_coord_logits = delta_bbox - else: - raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") - outputs_coord = outputs_coord_logits.sigmoid() - outputs_classes.append(outputs_class) - outputs_coords.append(outputs_coord) - outputs_class = torch.stack(outputs_classes) - outputs_coord = 
torch.stack(outputs_coords) + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) - logits = outputs_class[-1] - pred_boxes = outputs_coord[-1] +SPECIAL_TOKENS = [101, 102, 1012, 1029] - loss, loss_dict, auxiliary_outputs = None, None, None - if labels is not None: - # First: create the matcher - matcher = GroundingDinoHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality"] - criterion = GroundingDinoLoss( - matcher=matcher, - num_classes=self.config.num_labels, - focal_alpha=self.config.focal_alpha, - losses=losses, - ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes - if self.config.auxiliary_loss: - auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - if self.config.two_stage: - enc_outputs_coord = outputs[-1].sigmoid() - outputs_loss["enc_outputs"] = {"logits": outputs[-2], "pred_boxes": enc_outputs_coord} - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) +def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: + """Generate attention mask between each pair of special tokens and positional ids. + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + Returns: + `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids: + - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`) + - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`) + """ + batch_size, num_token = input_ids.shape + # special_tokens_mask: batch_size, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool() + for special_token in SPECIAL_TOKENS: + special_tokens_mask |= input_ids == special_token - if not return_dict: - if auxiliary_outputs is not None: - output = (logits, pred_boxes) + auxiliary_outputs + outputs - else: - output = (logits, pred_boxes) + outputs - tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) - return tuple_outputs + # generate attention mask and positional ids + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1) + position_ids = torch.zeros((batch_size, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) - dict_outputs = GroundingDinoObjectDetectionOutput( - loss=loss, - loss_dict=loss_dict, - logits=logits, - pred_boxes=pred_boxes, - auxiliary_outputs=auxiliary_outputs, - last_hidden_state=outputs.last_hidden_state, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, - encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, - encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, - encoder_hidden_states_text=outputs.encoder_hidden_states_text, - encoder_attentions=outputs.encoder_attentions, - intermediate_hidden_states=outputs.intermediate_hidden_states, - intermediate_reference_points=outputs.intermediate_reference_points, - init_reference_points=outputs.init_reference_points, - enc_outputs_class=outputs.enc_outputs_class, - enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, - ) + previous_col = col - return dict_outputs + return attention_mask, position_ids.to(torch.long) -# Copied from transformers.models.detr.modeling_detr.dice_loss -def dice_loss(inputs, targets, num_boxes): +@add_start_docstrings( """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes + The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. 
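`generate_masks_with_special_tokens_and_transfer_map` builds a block-diagonal self-attention mask so that text tokens only attend within their own phrase (phrases are delimited by `[CLS]`, `[SEP]` and sentence punctuation), and restarts the position ids at every phrase boundary. A standalone sketch of the same idea; the id-to-token reading in the comment is an assumption, and 2001/2002 stand in for ordinary word ids:

```python
import torch

SPECIAL_TOKENS = [101, 102, 1012, 1029]  # assumed: [CLS], [SEP], ".", "?" in a BERT-style vocab

def phrase_level_masks(input_ids: torch.LongTensor):
    """Sketch: block-diagonal self-attention mask + per-phrase position ids."""
    batch_size, num_token = input_ids.shape
    special = torch.zeros_like(input_ids, dtype=torch.bool)
    for tok in SPECIAL_TOKENS:
        special |= input_ids == tok

    attention_mask = torch.eye(num_token, dtype=torch.bool).unsqueeze(0).repeat(batch_size, 1, 1)
    position_ids = torch.zeros((batch_size, num_token), dtype=torch.long)

    previous_col = 0
    for row, col in torch.nonzero(special):
        if col == 0 or col == num_token - 1:
            attention_mask[row, col, col] = True
            position_ids[row, col] = 0
        else:
            # the phrase spans everything after the previous delimiter, up to and including this one
            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
            position_ids[row, previous_col + 1 : col + 1] = torch.arange(0, col - previous_col)
        previous_col = col
    return attention_mask, position_ids

# "[CLS] <w> <w> . <w> <w> . [SEP]"
ids = torch.tensor([[101, 2001, 2002, 1012, 2001, 2002, 1012, 102]])
mask, pos = phrase_level_masks(ids)
print(pos)            # tensor([[0, 0, 1, 2, 0, 1, 2, 0]])
print(mask[0].int())  # two 3x3 blocks on the diagonal, one per phrase
```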
+ """, + GROUNDING_DINO_START_DOCSTRING, +) +class GroundingDinoModel(GroundingDinoPreTrainedModel): + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) + # Create backbone + positional encoding + backbone = GroundingDinoConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = GroundingDinoConvModel(backbone, position_embeddings) -# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): - """ - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj_vision = nn.ModuleList(input_proj_list) + else: + self.input_proj_vision = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) - Args: - inputs (`torch.FloatTensor` of arbitrary shape): - The predictions for each example. - targets (`torch.FloatTensor` with the same shape as `inputs`) - A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class - and 1 for the positive class). - alpha (`float`, *optional*, defaults to `0.25`): - Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): - Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. 
+ # Create text backbone + self.text_backbone = GroundingDinoTextPrenet(config.text_backbone_config) + self.text_projection = nn.Linear(config.text_backbone_config.hidden_size, config.d_model) - Returns: - Loss tensor - """ - prob = inputs.sigmoid() - ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - # add modulating factor - p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) + if config.embedding_init_target or not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss + self.encoder = GroundingDinoEncoder(config) + self.decoder = GroundingDinoDecoder(config) - return loss.mean(1).sum() / num_boxes + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + if ( + config.two_stage_bbox_embed_share + and config.decoder_bbox_embed_share + and self.decoder.bbox_embed is not None + ): + self.encoder_output_bbox_embed = self.decoder.bbox_embed + else: + self.encoder_output_bbox_embed = GroundingDinoMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDino -class GroundingDinoLoss(nn.Module): - """ - This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we - compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of - matched ground-truth / prediction (supervise class and box). + self.encoder_output_class_embed = GroundingDinoContrastiveEmbedding(config) + else: + self.reference_points = nn.Embedding(config.num_queries, 4) - Args: - matcher (`GroundingDinoHungarianMatcher`): - Module able to compute a matching between targets and proposals. - num_classes (`int`): - Number of object categories, omitting the special no-object category. - focal_alpha (`float`): - Alpha parameter in focal loss. - losses (`List[str]`): - List of all the losses to be applied. See `get_loss` for a list of all available losses. 
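The focal loss described in the docstring above down-weights well-classified examples by the factor `(1 - p_t)**gamma` and rebalances positives against negatives with `alpha`. A compact worked sketch mirroring that formula, with the per-box normalisation left to the caller:

```python
import torch
import torch.nn.functional as F

def sigmoid_focal_loss(logits, targets, num_boxes, alpha=0.25, gamma=2.0):
    """Focal loss sketch: BCE scaled by (1 - p_t)**gamma, balanced by alpha."""
    prob = logits.sigmoid()
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)   # probability of the true class
    loss = ce * (1 - p_t) ** gamma
    if alpha >= 0:
        loss = (alpha * targets + (1 - alpha) * (1 - targets)) * loss
    return loss.mean(1).sum() / num_boxes

# an easy positive (logit 4.0) contributes far less than a hard one (logit -1.0)
logits = torch.tensor([[[4.0], [-1.0]]])   # (batch=1, num_queries=2, num_classes=1)
targets = torch.tensor([[[1.0], [1.0]]])
print(sigmoid_focal_loss(logits, targets, num_boxes=1))
```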
- """ + self.post_init() - def __init__(self, matcher, num_classes, focal_alpha, losses): - super().__init__() - self.matcher = matcher - self.num_classes = num_classes - self.focal_alpha = focal_alpha - self.losses = losses + def get_encoder(self): + return self.encoder - # removed logging parameter, which was part of the original implementation - def loss_labels(self, outputs, targets, indices, num_boxes): - """ - Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor - of dim [nb_target_boxes] - """ - if "logits" not in outputs: - raise KeyError("No logits were found in the outputs") - source_logits = outputs["logits"] + def get_decoder(self): + return self.decoder - idx = self._get_source_permutation_idx(indices) - target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - ) - target_classes[idx] = target_classes_o + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) - target_classes_onehot = torch.zeros( - [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], - dtype=source_logits.dtype, - layout=source_logits.layout, - device=source_logits.device, - ) - target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) - target_classes_onehot = target_classes_onehot[:, :, :-1] - loss_ce = ( - sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) - * source_logits.shape[1] - ) - losses = {"loss_ce": loss_ce} + def get_valid_ratio(self, mask): + """Get the valid ratio of all feature maps.""" - return losses + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_heigth = valid_height.float() / height + valid_ratio_width = valid_width.float() / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1) + return valid_ratio - @torch.no_grad() - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. - """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes - def loss_boxes(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. 
+ dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos - Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes - are expected in format (center_x, center_y, w, h), normalized by the image size. - """ - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - idx = self._get_source_permutation_idx(indices) - source_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. - loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") + Args: + enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder. + padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`. + spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps. - losses = {} - losses["loss_bbox"] = loss_bbox.sum() / num_boxes + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. 
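`get_proposal_pos_embed` turns each (cx, cy, w, h) proposal into a sinusoidal embedding, interleaving sines and cosines per coordinate exactly as in the standard transformer position encoding. A standalone sketch of that transform (the 256-dimensional `d_model` is illustrative):

```python
import math
import torch

def proposal_pos_embed(proposals: torch.Tensor, d_model: int = 256) -> torch.Tensor:
    """Sine/cosine embedding of (cx, cy, w, h) proposal logits -> (batch, queries, 2 * d_model)."""
    num_pos_feats = d_model // 2
    temperature = 10000
    scale = 2 * math.pi
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
    pos = proposals.sigmoid() * scale     # (batch, queries, 4), mapped into [0, 2*pi]
    pos = pos[:, :, :, None] / dim_t      # (batch, queries, 4, num_pos_feats)
    pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
    return pos

proposals = torch.randn(2, 900, 4)        # box logits before the sigmoid
print(proposal_pos_embed(proposals).shape)   # torch.Size([2, 900, 512])
```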
+ """ + batch_size = enc_output.shape[0] + proposals = [] + current_position = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)].view( + batch_size, height, width, 1 + ) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) - loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) - ) - losses["loss_giou"] = loss_giou.sum() / num_boxes - return losses + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx - def _get_source_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) - source_idx = torch.cat([source for (source, _) in indices]) - return batch_idx, source_idx - - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx - def _get_target_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) - target_idx = torch.cat([target for (_, target) in indices]) - return batch_idx, target_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes): - loss_map = { - "labels": self.loss_labels, - "cardinality": self.loss_cardinality, - "boxes": self.loss_boxes, - } - if loss not in loss_map: - raise ValueError(f"Loss {loss} not supported") - return loss_map[loss](outputs, targets, indices, num_boxes) - - def forward(self, outputs, targets): - """ - This performs the loss computation. + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + current_position += height * width - Args: - outputs (`dict`, *optional*): - Dictionary of tensors, see the output specification of the model for the format. - targets (`List[dict]`, *optional*): - List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the - losses applied, see each loss' doc. 
- """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals - # Compute the average number of target boxes accross all nodes, for normalization purposes - num_boxes = sum(len(t["class_labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - # (Niels): comment out function below, distributed training to be added - # if is_dist_avail_and_initialized(): - # torch.distributed.all_reduce(num_boxes) - # (Niels) in original implementation, num_boxes is divided by get_world_size() - num_boxes = torch.clamp(num_boxes, min=1).item() + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDinoModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Tensor, + input_ids: Tensor, + token_type_ids: Tensor = None, + attention_mask: Tensor = None, + pixel_mask: Optional[Tensor] = None, + encoder_outputs=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + Examples: - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. - if "auxiliary_outputs" in outputs: - for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): - indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) - l_dict = {k + f"_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) + ```python + >>> from transformers import AutoProcessor, GroundingDinoModel + >>> from PIL import Image + >>> import requests - if "enc_outputs" in outputs: - enc_outputs = outputs["enc_outputs"] - bin_targets = copy.deepcopy(targets) - for bt in bin_targets: - bt["class_labels"] = torch.zeros_like(bt["class_labels"]) - indices = self.matcher(enc_outputs, bin_targets) - for loss in self.losses: - l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) - l_dict = {k + "_enc": v for k, v in l_dict.items()} - losses.update(l_dict) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." 
- return losses + >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model(**inputs) -# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead -class GroundingDinoMLPPredictionHead(nn.Module): - """ - Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, - height and width of a bounding box w.r.t. an image. + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 900, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) - """ + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) - def forward(self, x): - for i, layer in enumerate(self.layers): - x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + max_text_len = self.config.max_text_len + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + input_ids = input_ids[:, :max_text_len] + token_type_ids = token_type_ids[:, :max_text_len] + text_token_mask = text_token_mask[:, :max_text_len] -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino -class GroundingDinoHungarianMatcher(nn.Module): - """ - This class computes an assignment between the targets and the predictions of the network. + # Extract text features from text backbone + text_features = self.text_backbone(input_ids, text_self_attention_masks, token_type_ids, position_ids)[ + "last_hidden_state" + ] + text_features = self.text_projection(text_features) - For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more - predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are - un-matched (and thus treated as non-objects). + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device - Args: - class_cost: - The relative weight of the classification error in the matching cost. - bbox_cost: - The relative weight of the L1 error of the bounding box coordinates in the matching cost. - giou_cost: - The relative weight of the giou loss of the bounding box in the matching cost. 
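The bounding-box regression branch referenced in this hunk is a plain 3-layer MLP with ReLU between hidden layers and no activation on the output; the 4 outputs are treated as (cx, cy, w, h) logits and squashed with a sigmoid. A usage sketch with illustrative sizes:

```python
import torch
import torch.nn as nn

class MLPPredictionHead(nn.Module):
    """3-layer MLP: d_model -> d_model -> d_model -> 4 box logits (sketch)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = torch.relu(layer(x)) if i < len(self.layers) - 1 else layer(x)
        return x

head = MLPPredictionHead(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
decoder_output = torch.randn(2, 900, 256)   # (batch, num_queries, d_model), sizes made up
boxes = head(decoder_output).sigmoid()      # normalized (cx, cy, w, h) in [0, 1]
print(boxes.shape)                          # torch.Size([2, 900, 4])
```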
- """ + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) - def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): - super().__init__() - requires_backends(self, ["scipy"]) + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: - raise ValueError("All costs of the Matcher can't be 0") + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(vision_features): + sources.append(self.input_proj_vision[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") - @torch.no_grad() - def forward(self, outputs, targets): - """ - Args: - outputs (`dict`): - A dictionary that contains at least these entries: - * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. - targets (`List[dict]`): - A list of targets (len(targets) = batch_size), where each target is a dict containing: - * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of - ground-truth - objects in the target) containing the class labels - * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
+ # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj_vision[level](vision_features[-1][0]) + else: + source = self.input_proj_vision[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) - Returns: - `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - batch_size, num_queries = outputs["logits"].shape[:2] + # Create queries + query_embeds = None + if self.config.embedding_init_target or self.config.two_stage: + query_embeds = self.query_position_embeddings.weight - # We flatten to compute the cost matrices in a batch - out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + valid_ratios = valid_ratios.float() - # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) - target_bbox = torch.cat([v["boxes"] for v in targets]) - - # Compute the classification cost. 
- alpha = 0.25 - gamma = 2.0 - neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) - pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) - class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + vision_features=source_flatten, + vision_attention_mask=~mask_flatten, + vision_position_embedding=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + text_features=text_features, + text_attention_mask=~text_token_mask, + text_position_embedding=None, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDinoEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, GroundingDinoEncoderOutput): + encoder_outputs = GroundingDinoEncoderOutput( + last_hidden_state_vision=encoder_outputs[0], + last_hidden_state_text=encoder_outputs[1], + hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, + attentions=encoder_outputs[4] if len(encoder_outputs) > 4 else None, + ) - # Compute the L1 cost between boxes - bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + # Fifth, prepare decoder inputs + enc_outputs_class = None + enc_outputs_coord_logits = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes + ) - # Compute the giou cost between boxes - giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + # hack implementation for two-stage Deformable DETR + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. 
foreground and background) + enc_outputs_class = self.encoder_output_class_embed( + object_query_embedding, encoder_outputs[1], text_token_mask + ) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals - # Final cost matrix - cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost - cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + # only keep top scoring `config.num_queries` proposals + topk = self.config.num_queries + topk_logits = enc_outputs_class.max(-1)[0] + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + if query_embeds is not None: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + else: + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ).detach() + else: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() + init_reference_points = reference_points + decoder_outputs = self.decoder( + inputs_embeds=target, + vision_encoder_hidden_states=encoder_outputs[0], + vision_encoder_attention_mask=mask_flatten, + text_encoder_hidden_states=encoder_outputs[1], + text_encoder_attention_mask=~text_token_mask, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + self_attn_mask=None, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) -# Copied from transformers.models.detr.modeling_detr._upcast -def _upcast(t: Tensor) -> Tensor: - # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + return tuple_outputs -# Copied from transformers.models.detr.modeling_detr.box_area -def box_area(boxes: Tensor) -> Tensor: - """ - Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. 
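In the two-stage path only the `config.num_queries` highest-scoring encoder proposals are kept, and their detached box logits (after a sigmoid) become the decoder's initial reference points. A shape-level sketch of that top-k selection with made-up sizes:

```python
import torch

num_queries = 3                              # config.num_queries, kept small here
enc_class_logits = torch.randn(1, 10, 20)    # (batch, proposals, text dim) similarity logits
enc_coord_logits = torch.randn(1, 10, 4)     # proposal box logits (before the sigmoid)

# score each proposal by its best-matching text feature, keep the top `num_queries`
topk_scores, topk_idx = torch.topk(enc_class_logits.max(-1).values, num_queries, dim=1)
topk_coords = torch.gather(enc_coord_logits, 1, topk_idx.unsqueeze(-1).repeat(1, 1, 4))

reference_points = topk_coords.detach().sigmoid()   # initial (cx, cy, w, h) for the decoder
print(reference_points.shape)                       # torch.Size([1, 3, 4])
```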
+ return GroundingDinoModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, + encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, + encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, + encoder_hidden_states_text=encoder_outputs.hidden_states_text, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) - Args: - boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): - Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 - < x2` and `0 <= y1 < y2`. - Returns: - `torch.FloatTensor`: a tensor containing the area for each box. +@add_start_docstrings( """ - boxes = _upcast(boxes) - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, + for tasks such as COCO detection. + """, + GROUNDING_DINO_START_DOCSTRING, +) +class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*"] -# Copied from transformers.models.detr.modeling_detr.box_iou -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) - left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + # Deformable DETR encoder-decoder model + self.model = GroundingDinoModel(config) - width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] - inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + # Detection heads on top + _class_embed = GroundingDinoContrastiveEmbedding(config) + _bbox_embed = GroundingDinoMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) - union = area1[:, None] + area2 - inter + if config.decoder_bbox_embed_share: + self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)]) + else: + self.bbox_embed = _get_clones(_bbox_embed, config.decoder_layers) + self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) + # hack implementation for two-stage + self.model.decoder.bbox_embed = self.bbox_embed + self.model.decoder.class_embed = self.class_embed - iou = inter / union - return iou, union + # Initialize weights and apply final processing + self.post_init() + # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDinoObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + attention_mask: torch.LongTensor = None, + token_type_ids: torch.LongTensor = None, + pixel_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, Tuple]] = None, + labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. - Returns: - `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): - raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") - if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): - raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") - iou, union = box_iou(boxes1, boxes2) + Returns: - top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + Examples: - width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] - area = width_height[:, :, 0] * width_height[:, :, 1] + ```python + >>> from transformers import AutoProcessor, GroundingDinoForObjectDetection + >>> from PIL import Image + >>> import requests - return iou - (area - union) / area + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." + >>> processor = AutoProcessor.from_pretrained("EduardoPacheco/grounding-dino-tiny") + >>> model = GroundingDinoForObjectDetection.from_pretrained("EduardoPacheco/grounding-dino-tiny") -# Copied from transformers.models.detr.modeling_detr._max_by_axis -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = processor.image_processor.post_process_object_detection( + ... outputs, threshold=0.35, target_sizes=target_sizes + ... 
)[0] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print(f"Detected {label.item()} with confidence " f"{round(score.item(), 3)} at location {box}") + Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83] + Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict -# Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor(object): - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_mask=pixel_mask, + encoder_outputs=encoder_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) - def decompose(self): - return self.tensors, self.mask + # index for encoder_last_hidden_state_text + idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0) - def __repr__(self): - return str(self.tensors) + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] -# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return NestedTensor(tensor, mask) + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level]( + vision_hidden_state=hidden_states[:, level], + text_hidden_state=enc_text_hidden_state, + text_token_mask=attention_mask.bool(), + ) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise 
ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] -# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDinoText -class GroundingDinoTextEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = GroundingDinoHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = GroundingDinoLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + ) + criterion.to(self.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if self.config.auxiliary_loss: + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + if self.config.two_stage: + enc_outputs_coord = outputs[-1].sigmoid() + outputs_loss["enc_outputs"] = {"logits": outputs[-2], "pred_boxes": enc_outputs_coord} - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer( - "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False - ) - self.register_buffer( - "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False - ) + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - token_type_ids: 
Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - past_key_values_length: int = 0, - ) -> torch.Tensor: - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] + return tuple_outputs - seq_length = input_shape[1] + dict_outputs = GroundingDinoObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, + encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, + encoder_hidden_states_vision=outputs.encoder_hidden_states_vision, + encoder_hidden_states_text=outputs.encoder_hidden_states_text, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) - if position_ids is None: - position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + return dict_outputs - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves - # issue #5664 - if token_type_ids is None: - if hasattr(self, "token_type_ids"): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) +# Copied from transformers.models.detr.modeling_detr.dice_loss +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). 
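The DICE loss described in this docstring rewards overlap between a predicted (sigmoid) mask and its binary target, much like a soft IoU; a perfect prediction drives it towards zero. A tiny worked sketch (both tensors are flattened here for self-containment, which the caller normally handles):

```python
import torch

def dice_loss(inputs, targets, num_boxes):
    """DICE loss sketch: 1 - (2*|X*Y| + 1) / (|X| + |Y| + 1), averaged over boxes."""
    inputs = inputs.sigmoid().flatten(1)
    targets = targets.flatten(1)
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    return (1 - (numerator + 1) / (denominator + 1)).sum() / num_boxes

# a prediction that confidently covers the whole target gives a loss close to 0
logits = torch.full((1, 4, 4), 10.0)            # sigmoid ~ 1 everywhere
target = torch.ones((1, 4, 4))
print(dice_loss(logits, target, num_boxes=1))   # tensor(~2e-05)
```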
+ """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->GroundingDinoText -class GroundingDinoTextSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})" - ) +# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss - self.is_decoder = config.is_decoder + return loss.mean(1).sum() / num_boxes - def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - mixed_query_layer = self.query(hidden_states) +# Copied 
from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDino +class GroundingDinoLoss(nn.Module): + """ + This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we + compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of + matched ground-truth / prediction (supervise class and box). - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None + Args: + matcher (`GroundingDinoHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parameter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses - query_layer = self.transpose_for_scores(mixed_query_layer) + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] - use_cache = past_key_value is not None - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + target_classes_onehot = torch.zeros( + [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + dtype=source_logits.dtype, + layout=source_logits.layout, + device=source_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( - -1, 1 - ) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * source_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + return losses - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + @torch.no_grad() + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in GroundingDinoTextModel forward() function) - attention_scores = attention_scores + attention_mask + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. 
+ """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") - context_layer = torch.matmul(attention_probs, value_layer) + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx - if self.is_decoder: - outputs = outputs + (past_key_value,) - return outputs + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GroundingDinoText -class 
GroundingDinoTextSelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) + def forward(self, outputs, targets): + """ + This performs the loss computation. - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) -# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->GroundingDinoText -class GroundingDinoTextAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): - super().__init__() - self.self = GroundingDinoTextSelfAttention(config, position_embedding_type=position_embedding_type) - self.output = GroundingDinoTextSelfOutput(config) - self.pruned_heads = set() + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # (Niels): comment out function below, distributed training to be added + # if is_dist_avail_and_initialized(): + # torch.distributed.all_reduce(num_boxes) + # (Niels) in original implementation, num_boxes is divided by get_world_size() + num_boxes = torch.clamp(num_boxes, min=1).item() - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads - ) + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
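To make the auxiliary-loss bookkeeping that follows easier to read, here is a minimal, self-contained sketch (with made-up loss values, not taken from the diff) of how losses from intermediate decoder layers get index-suffixed keys:

```python
# Minimal sketch with made-up values: losses from the i-th intermediate decoder layer
# are stored under index-suffixed keys so they can be logged and weighted separately.
l_dict = {"loss_ce": 0.7, "loss_bbox": 0.3, "loss_giou": 0.5}
i = 1  # hypothetical intermediate decoder layer index
print({k + f"_{i}": v for k, v in l_dict.items()})
# {'loss_ce_1': 0.7, 'loss_bbox_1': 0.3, 'loss_giou_1': 0.5}
```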
+ if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) + if "enc_outputs" in outputs: + enc_outputs = outputs["enc_outputs"] + bin_targets = copy.deepcopy(targets) + for bt in bin_targets: + bt["class_labels"] = torch.zeros_like(bt["class_labels"]) + indices = self.matcher(enc_outputs, bin_targets) + for loss in self.losses: + l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) + l_dict = {k + "_enc": v for k, v in l_dict.items()} + losses.update(l_dict) - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs + return losses -# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GroundingDinoText -class GroundingDinoTextIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead +class GroundingDinoMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. 
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + """ -# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GroundingDinoText -class GroundingDinoTextOutput(nn.Module): - def __init__(self, config): + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x -# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->GroundingDinoText -class GroundingDinoTextLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = GroundingDinoTextAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = GroundingDinoTextAttention(config, position_embedding_type="absolute") - self.intermediate = GroundingDinoTextIntermediate(config) - self.output = GroundingDinoTextOutput(config) - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino +class GroundingDinoHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + For efficiency reasons, the targets don't include the no_object. 
Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, "crossattention"): - raise ValueError( - f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" - " by setting `config.add_cross_attention=True`" - ) + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") - layer_output = apply_chunking_to_forward( - self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output - ) - outputs = (layer_output,) + outputs + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
- # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value,) + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] - return outputs + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->GroundingDinoText -class GroundingDinoTextEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList([GroundingDinoTextLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = False, - output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[i] if past_key_values is not None else None - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer_module.__call__, - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) +# Copied from transformers.models.detr.modeling_detr._upcast +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1],) - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) +# Copied from transformers.models.detr.modeling_detr.box_area +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) -# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->GroundingDinoText -class GroundingDinoTextPooler(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
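As a worked example of the `linear_sum_assignment` call in the matcher above, the sketch below runs the Hungarian algorithm on a made-up 3-query-by-2-target cost matrix for a single image; the cost values are invented purely for illustration:

```python
import torch
from scipy.optimize import linear_sum_assignment

# Made-up cost matrix: 3 predicted queries x 2 ground-truth boxes for one image.
cost = torch.tensor([[0.9, 0.1],
                     [0.4, 0.8],
                     [0.2, 0.3]])
row_ind, col_ind = linear_sum_assignment(cost.numpy())
print(row_ind, col_ind)  # [0 2] [1 0]: query 0 -> target 1, query 2 -> target 0, query 1 unmatched
```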
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output +# Copied from transformers.models.detr.modeling_detr.box_iou +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] -class GroundingDinoTextPrenet(GroundingDinoPreTrainedModel): - config_class = GroundingDinoTextConfig + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) - self.config = config + union = area1[:, None] + area2 - inter - self.embeddings = GroundingDinoTextEmbeddings(config) - self.encoder = GroundingDinoTextEncoder(config) + iou = inter / union + return iou, union - self.pooler = GroundingDinoTextPooler(config) if add_pooling_layer else None - # Initialize weights and apply final processing - self.post_init() +# Copied from transformers.models.detr.modeling_detr.generalized_box_iou +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. - def get_input_embeddings(self): - return self.embeddings.word_embeddings + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return iou - (area - union) / area - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device +# Copied from transformers.models.detr.modeling_detr._max_by_axis +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length)), device=device) - if token_type_ids is None: - if hasattr(self.embeddings, "token_type_ids"): - buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) +# Copied from transformers.models.detr.modeling_detr.NestedTensor +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + def decompose(self): + return self.tensors, self.mask - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + def __repr__(self): + return str(self.tensors) - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + batch_size, num_channels, height, width = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) From 06ba0ecdf65223c6dc0356dcba24a4adc6ecc654 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 01:18:27 +0100 Subject: [PATCH 161/252] Fixed processing to avoid messing with inputs --- .../models/grounding_dino/processing_grounding_dino.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 0e658a42f77baa..7e6d2a2b29f9d2 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -40,6 +40,9 @@ def get_phrases_from_posmap(posmaps, input_ids): left_idx = 0 right_idx = 255 + # Avoiding altering the input tensor + posmaps = posmaps.clone() + posmaps[:, 0 : left_idx + 1] = False posmaps[:, right_idx:] = False From 9cda12eba36e84add9d82f305c17dccb80aaef3d Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 02:26:47 +0100 Subject: [PATCH 162/252] Added more tips for GroundingDino --- docs/source/en/model_doc/grounding-dino.md 
| 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index f3ccc78ad5c876..2c6bbf735cd0eb 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -27,6 +27,40 @@ The abstract from the paper is the following: Tips: - One can use [`GroundingDinoProcessor`] to prepare image-text pairs for the model. +- To separate classes in the text, use a period, e.g. "a cat. a dog." +- When using multiple classes, use `post_process_grounded_object_detection` from [`GroundingDinoProcessor`] to post-process the outputs + +```python +import requests + +import torch +from PIL import Image +from transformers import AutoModelForObjectDetection, AutoProcessor + +model_id = "EduardoPacheco/grounding-dino-tiny" + +model = AutoModelForObjectDetection.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +def load_image(url): + return Image.open(requests.get(url, stream=True).raw) + +image = load_image('http://images.cocodataset.org/val2017/000000039769.jpg') +# Check for cats and remote controls +text = "a cat. a remote control" + +inputs = processor(images=image, text=text, return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs) + +results = processor.post_process_grounded_object_detection( + outputs, + inputs.input_ids, + bbox_threshold=0.4, + text_threshold=0.3, + target_sizes=[image.size[::-1]] +) +``` drawing @@ -46,6 +80,7 @@ The original code can be found [here](https://github.com/IDEA-Research/Grounding ## GroundingDinoProcessor [[autodoc]] GroundingDinoProcessor + - post_process_grounded_object_detection ## GroundingDinoTextConfig From bde2c6a6d4137e3f11df525c817abc05b74d0ece Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 02:27:02 +0100 Subject: [PATCH 163/252] Make style --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 7edaa5dbbfb827..ab6a2e9df1c1e9 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2069,7 +2069,6 @@ def custom_forward(*inputs): ) - # Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->GroundingDinoText class GroundingDinoTextEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -2672,6 +2671,7 @@ def forward( attentions=encoder_outputs.attentions, ) + SPECIAL_TOKENS = [101, 102, 1012, 1029] From 01c382e7544e658c4ce7bb0ecdd11c552f700d81 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 4 Feb 2024 02:30:22 +0100 Subject: [PATCH 164/252] Changing name to align with SAM --- .../grounding_dino/modeling_grounding_dino.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index ab6a2e9df1c1e9..c2cd8aa74ac454 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -206,11 +206,11 @@ class GroundingDinoEncoderOutput(ModelOutput): Sequence of hidden-states at the output of the last layer of the vision encoder.
last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the text encoder. - hidden_states_vision (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the output of each layer plus the initial embedding outputs. - hidden_states_text (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of each layer plus the initial embedding outputs. @@ -223,8 +223,8 @@ class GroundingDinoEncoderOutput(ModelOutput): last_hidden_state_vision: torch.FloatTensor = None last_hidden_state_text: torch.FloatTensor = None - hidden_states_vision: Optional[Tuple[torch.FloatTensor]] = None - hidden_states_text: Optional[Tuple[torch.FloatTensor]] = None + vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + text_hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None @@ -1626,7 +1626,7 @@ def _set_gradient_checkpointing(self, module, value=False): encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): Tuple consists of (`last_hidden_state_vision`, *optional*: `last_hidden_state_text`, *optional*: - `hidden_states_vision`, *optional*: `hidden_states_text`, *optional*: `attentions`) + `vision_hidden_states`, *optional*: `text_hidden_states`, *optional*: `attentions`) `last_hidden_state_vision` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. 
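Because `GroundingDinoModel` also indexes the encoder output positionally (`encoder_outputs[2]`, `encoder_outputs[3]` in the hunk below), the rename in this patch only changes attribute names, not tuple positions. A toy dataclass stand-in (not the real `GroundingDinoEncoderOutput`) illustrates the field order being relied on:

```python
from dataclasses import dataclass, fields
from typing import Optional, Tuple
import torch

# Toy stand-in for the renamed output class; positional indexing relies on this field order.
@dataclass
class ToyEncoderOutput:
    last_hidden_state_vision: Optional[torch.Tensor] = None
    last_hidden_state_text: Optional[torch.Tensor] = None
    vision_hidden_states: Optional[Tuple[torch.Tensor, ...]] = None
    text_hidden_states: Optional[Tuple[torch.Tensor, ...]] = None

print([f.name for f in fields(ToyEncoderOutput)][2:4])  # ['vision_hidden_states', 'text_hidden_states']
```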
@@ -1803,8 +1803,8 @@ def forward( return GroundingDinoEncoderOutput( last_hidden_state_vision=vision_features, last_hidden_state_text=text_features, - hidden_states_vision=encoder_vision_states, - hidden_states_text=encoder_text_states, + vision_hidden_states=encoder_vision_states, + text_hidden_states=encoder_text_states, attentions=all_attns, ) @@ -3042,8 +3042,8 @@ def forward( encoder_outputs = GroundingDinoEncoderOutput( last_hidden_state_vision=encoder_outputs[0], last_hidden_state_text=encoder_outputs[1], - hidden_states_vision=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - hidden_states_text=encoder_outputs[3] if len(encoder_outputs) > 3 else None, + vision_hidden_states=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + text_hidden_states=encoder_outputs[3] if len(encoder_outputs) > 3 else None, attentions=encoder_outputs[4] if len(encoder_outputs) > 4 else None, ) @@ -3118,8 +3118,8 @@ def forward( decoder_attentions=decoder_outputs.attentions, encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, - encoder_hidden_states_vision=encoder_outputs.hidden_states_vision, - encoder_hidden_states_text=encoder_outputs.hidden_states_text, + encoder_hidden_states_vision=encoder_outputs.vision_hidden_states, + encoder_hidden_states_text=encoder_outputs.text_hidden_states, encoder_attentions=encoder_outputs.attentions, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, From 5d1f0e77c1b96a2820722ae49bed1e67b8c62954 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 4 Feb 2024 13:17:34 +0100 Subject: [PATCH 165/252] Replace final nn.multiheadattention --- .../convert_grounding_dino_to_hf.py | 28 +++++++++++++++++++ .../grounding_dino/modeling_grounding_dino.py | 27 ++++++++++++------ 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index 3d9b7673fbef38..8af08626022183 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -313,6 +313,33 @@ def read_in_q_k_v_encoder(state_dict, config): ########################################## VISION BACKBONE - END +def read_in_q_k_v_text_enhancer(state_dict, config): + hidden_size = config.hidden_size + for idx in range(config.encoder_layers): + # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.weight"] = in_proj_weight[ + :hidden_size, : + ] + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.bias"] = in_proj_bias[:hidden_size] + + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.bias"] = in_proj_bias[ + hidden_size : hidden_size * 2 + ] + + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.weight"] = in_proj_weight[ 
+ -hidden_size:, : + ] + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.bias"] = in_proj_bias[ + -hidden_size: + ] + + def read_in_q_k_v_decoder(state_dict, config): hidden_size = config.hidden_size for idx in range(config.decoder_layers): @@ -393,6 +420,7 @@ def convert_grounding_dino_checkpoint(args): for src, dest in rename_keys: rename_key(new_state_dict, src, dest) read_in_q_k_v_encoder(new_state_dict, config) + read_in_q_k_v_text_enhancer(new_state_dict, config) read_in_q_k_v_decoder(new_state_dict, config) # Load HF model diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index f99ec9ab2c9717..add89a2927ecbd 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -766,12 +766,10 @@ class GroundingDinoTextEnhancerLayer(nn.Module): def __init__(self, config): super().__init__() - self.self_attn = nn.MultiheadAttention( - embed_dim=config.d_model, - num_heads=config.encoder_attention_heads // 2, - dropout=config.text_enhancer_dropout, - batch_first=True, - ) + mha_config = copy.deepcopy(config) + mha_config.num_attention_heads = config.encoder_attention_heads // 2 + self.self_attn = GroundingDinoMultiheadAttention(mha_config) + # Implementation of Feedforward model self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) @@ -814,12 +812,23 @@ def forward( # repeat attn mask if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: - # bs, num_q, num_k - attention_masks = attention_masks.repeat(self.num_heads, 1, 1) + # batch_size, num_queries, num_keys + # TODO we shouldn't switch the attention mask here + attention_masks = ~attention_masks + attention_masks = attention_masks[:, None, :, :] + attention_masks = attention_masks.repeat(1, self.num_heads, 1, 1) + + dtype = torch.float16 + attention_masks = attention_masks.to(dtype=dtype) # fp16 compatibility + attention_masks = (1.0 - attention_masks) * torch.finfo(dtype).min queries = keys = self.with_pos_embed(hidden_states, position_embeddings) attention_output, attention_weights = self.self_attn( - query=queries, key=keys, value=hidden_states, attn_mask=attention_masks, average_attn_weights=False + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=attention_masks, + output_attentions=True, ) attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) hidden_states = hidden_states + attention_output From 339915f1d7c8896f3c9191f6c3380cd02da0d09e Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 4 Feb 2024 14:41:55 +0100 Subject: [PATCH 166/252] Fix model tests --- .../models/grounding_dino/modeling_grounding_dino.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index add89a2927ecbd..cbf63b6e7fcce2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1417,11 +1417,12 @@ def forward( # Cross-Attention Text queries = self.with_pos_embed(hidden_states, position_embeddings) + hidden_states, text_cross_attn_weights = self.encoder_attn_text( queries=queries, keys=text_encoder_hidden_states, 
values=text_encoder_hidden_states, - attention_mask=text_encoder_attention_mask, + # attention_mask=text_encoder_attention_mask, # TODO fix cross-attention mask here output_attentions=True, ) From 1bb488639ea31d26b74782fa3edc888e5ce81ee1 Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 4 Feb 2024 14:47:25 +0100 Subject: [PATCH 167/252] Update year, remove GenerationTesterMixin --- docs/source/en/model_doc/grounding-dino.md | 2 +- src/transformers/models/grounding_dino/__init__.py | 2 +- .../models/grounding_dino/configuration_grounding_dino.py | 2 +- .../models/grounding_dino/convert_grounding_dino_to_hf.py | 2 +- .../models/grounding_dino/image_processing_grounding_dino.py | 2 +- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- .../models/grounding_dino/processing_grounding_dino.py | 2 +- .../grounding_dino/test_image_processing_grounding_dino.py | 2 +- tests/models/grounding_dino/test_modeling_grounding_dino.py | 5 ++--- tests/models/grounding_dino/test_processor_grounding_dino.py | 2 +- 10 files changed, 11 insertions(+), 12 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index f3ccc78ad5c876..cc431ef448cc3f 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -1,4 +1,4 @@ -