From 2986dc21201fe1a687badd62d2be667d6b335ffe Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 3 Nov 2024 10:48:55 +0500 Subject: [PATCH 01/72] implement config and model building blocks --- .../depth_pro/configuration_depth_pro.py | 167 ++ .../models/depth_pro/modeling_depth_pro.py | 1404 +++++++++++++++++ 2 files changed, 1571 insertions(+) create mode 100644 src/transformers/models/depth_pro/configuration_depth_pro.py create mode 100644 src/transformers/models/depth_pro/modeling_depth_pro.py diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py new file mode 100644 index 00000000000000..ad0f1016f7a147 --- /dev/null +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DepthPro model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig +from transformers.utils import logging +from transformers.utils.backbone_utils import get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class DepthProConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DepthProModel`]. It is used to instantiate a + DepthPro model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DepthPro + [apple/DepthPro](https://huggingface.co/apple/DepthPro) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. 
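+        decoder_hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the feature maps used by the decoder (feature fusion) blocks.
+        patch_encoder_hook_ids (`List[int]`, *optional*, defaults to `[5, 11]`):
+            Indices of the patch encoder layers whose hidden states are tapped as intermediate features.
+        patch_encoder_feature_dims (`List[int]`, *optional*, defaults to `[256, 512, 1024, 1024]`):
+            Number of channels of the upsampled encoder features that are passed to the decoder.
+        use_batch_norm_in_decoder (`bool`, *optional*, defaults to `False`):
+            Whether to use batch normalization in the residual layers of the decoder.
+        use_fov (`bool`, *optional*, defaults to `False`):
+            Whether to add the field of view (FOV) estimation model on top of the encoder.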
+ + Example: + + ```python + >>> from transformers import DepthProConfig, DepthProModel + + >>> # Initializing a DepthPro apple/DepthPro style configuration + >>> configuration = DepthProConfig() + + >>> # Initializing a model (with random weights) from the apple/DepthPro style configuration + >>> model = DepthProModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "depth_pro" + + def __init__( + self, + hidden_size=1024, # changed + decoder_hidden_size=256, + num_hidden_layers=24, # changed + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=384, + patch_size=16, # changed + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + patch_encoder_hook_ids = [5, 11], + # patch_encoder_hook_ids = [5, 11, 17, 23], + patch_encoder_feature_dims = [256, 512, 1024, 1024], + use_batch_norm_in_decoder=False, + use_fov=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.decoder_hidden_size = decoder_hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.patch_encoder_hook_ids = patch_encoder_hook_ids + self.patch_encoder_feature_dims = patch_encoder_feature_dims + self.use_batch_norm_in_decoder = use_batch_norm_in_decoder + self.use_fov = use_fov diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py new file mode 100644 index 00000000000000..f73b74060f5778 --- /dev/null +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -0,0 +1,1404 @@ +# coding=utf-8 +# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch DepthPro model.""" + +from icecream import ic + +import collections.abc +import math +from typing import Dict, List, Optional, Set, Tuple, Union + +import torch +from torch import nn +from dataclasses import dataclass + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, +) +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from .configuration_depth_pro import DepthProConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT +class DepthProViTPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.dinov2.modeling_dinov2.DepthProViTEmbeddings +# with DepthProViT->DepthProViT and antialias=True in interpolation +class DepthProViTEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. + """ + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.patch_embeddings = DepthProViTPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + target_dtype = patch_pos_embed.dtype + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(torch.float32), + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProPatchEmbeddings + ).to(dtype=target_dtype) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthProViT +class DepthProViTSelfAttention(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SelfAttention with Dinov2->DepthProViT +class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): + def __init__(self, config: DepthProConfig) -> None: + super().__init__(config) + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + ) + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + head_mask, + self.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=None, + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, None + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DepthProViT +class DepthProViTSelfOutput(nn.Module): + """ + The residual connection is defined in DepthProViTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->DepthProViT +class DepthProViTAttention(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.attention = DepthProViTSelfAttention(config) + self.output = DepthProViTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->DepthProViT +class DepthProViTSdpaAttention(DepthProViTAttention): + def __init__(self, config: DepthProConfig) -> None: + super().__init__(config) + self.attention = DepthProViTSdpaSelfAttention(config) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaAttention with 
Dinov2->DepthProViT +class DepthProViTLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath +class DepthProViTDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthProViT +class DepthProViTMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthProViT +class DepthProViTSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +DEPTHPROVIT_ATTENTION_CLASSES = { + "eager": 
DepthProViTAttention, + "sdpa": DepthProViTSdpaAttention, +} + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2->DepthProViT +class DepthProViTLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = DEPTHPROVIT_ATTENTION_CLASSES[config._attn_implementation](config) + self.layer_scale1 = DepthProViTLayerScale(config) + self.drop_path = DepthProViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = DepthProViTSwiGLUFFN(config) + else: + self.mlp = DepthProViTMLP(config) + self.layer_scale2 = DepthProViTLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in DepthProViT, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in DepthProViT, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->DepthProViT +class DepthProViTEncoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([DepthProViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + 
last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class DepthProViT(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.embeddings = DepthProViTEmbeddings(config) + self.encoder = DepthProViTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class DepthProEncoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + + self.out_size = 24 # TODO: image_size // patch_size + + # patch encoder + self.patch_encoder = DepthProViT(config) + self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[0]].register_forward_hook( + self._intermediate0_hook + ) + self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[1]].register_forward_hook( + self._intermediate1_hook + ) + + # image encoder + self.image_encoder = DepthProViT(config) + + # upsampling features (1-2) + self.upsample_intermediate0 = self._create_project_upsample_block( + input_dims=config.hidden_size, + intermediate_dims=config.patch_encoder_feature_dims[0], + output_dims=config.decoder_hidden_size, + n_upsample_layers=3, + ) + self.upsample_intermediate1 = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[0], + n_upsample_layers=2, + ) + + # upsampling features (3-5) + self.upsample_high_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[1], + n_upsample_layers=1, + ) + self.upsample_med_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[2], + n_upsample_layers=1, + ) + self.upsample_low_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[3], + n_upsample_layers=1, + ) + + # upsampling features (6) + self.upsample_image = nn.ConvTranspose2d( + in_channels=config.hidden_size, + out_channels=config.patch_encoder_feature_dims[3], + kernel_size=2, + stride=2, + padding=0, + bias=True, + ) + self.fuse_image_with_low_res = nn.Conv2d( + 
in_channels=(config.patch_encoder_feature_dims[3] + config.patch_encoder_feature_dims[3]), + out_channels=config.patch_encoder_feature_dims[3], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + def _intermediate0_hook(self, model, input, output): + self.intermediate0_hidden_states = output[0] + + def _intermediate1_hook(self, model, input, output): + self.intermediate1_hidden_states = output[0] + + def _create_project_upsample_block( + self, + input_dims: int, + output_dims: int, + n_upsample_layers: int, + intermediate_dims: Optional[int] = None, + ) -> nn.Module: + + intermediate_dims = intermediate_dims or output_dims + + # Projection block followed by upsampling blocks. + blocks = [ + nn.Conv2d(input_dims, intermediate_dims, kernel_size=1, stride=1, padding=0, bias=False) + ] + [ + nn.ConvTranspose2d( + in_channels=(intermediate_dims if i == 0 else output_dims), + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=False + ) for i in range(n_upsample_layers) + ] + + return nn.Sequential(*blocks) + + def _interpolate(self, pixel_values, scale_factor): + return nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=scale_factor, + mode="bilinear", + align_corners=False, + ) + + def _patch(self, pixel_values, overlap_ratio): + patch_size = 384 # TODO: this should be infered + patch_stride = int(patch_size * (1 - overlap_ratio)) + + image_size = pixel_values.shape[-1] + steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1 + + x_patch_list = [] + for j in range(steps): + j0 = j * patch_stride + j1 = j0 + patch_size + + for i in range(steps): + i0 = i * patch_stride + i1 = i0 + patch_size + x_patch_list.append(pixel_values[..., j0:j1, i0:i1]) + + return torch.cat(x_patch_list, dim=0) + + def _reshape_feature( + self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 + ): + """Discard class token and reshape 1D feature map to a 2D grid.""" + b, hw, c = hidden_states.shape + + # Remove class token. 
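+        # The first `cls_token_offset` tokens (the [CLS] token by default) are dropped,
+        # so that only patch tokens are reshaped into the 2D grid below.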
+ if cls_token_offset > 0: + hidden_states = hidden_states[:, cls_token_offset:, :] + + # Shape: (batch, height, width, dim) -> (batch, dim, height, width) + hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) + return hidden_states + + def _merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor: + """Merge the patched input into a image with sliding window.""" + steps = int(math.sqrt(x.shape[0] // batch_size)) + + idx = 0 + + output_list = [] + for j in range(steps): + output_row_list = [] + for i in range(steps): + output = x[batch_size * idx : batch_size * (idx + 1)] + + if j != 0: + output = output[..., padding:, :] + if i != 0: + output = output[..., :, padding:] + if j != steps - 1: + output = output[..., :-padding, :] + if i != steps - 1: + output = output[..., :, :-padding] + + output_row_list.append(output) + idx += 1 + + output_row = torch.cat(output_row_list, dim=-1) + output_list.append(output_row) + output = torch.cat(output_list, dim=-2) + return output + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.shape[0] + + # STEP 1: create 3-level image + + high_res = pixel_values + med_res = self._interpolate(pixel_values, 0.5) + low_res = self._interpolate(pixel_values, 0.25) + + # STEP 2: create patches + + high_res_patches = self._patch(high_res, 0.25) + med_res_patches = self._patch(med_res, 0.5) + low_res_patches = low_res + + patches = torch.cat( + (high_res_patches, med_res_patches, low_res_patches), + dim=0, + ) + + # STEP 3: apply patch encoder + + patch_encodings = self.patch_encoder( + patches, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + patch_features = patch_encodings[0] + patch_features = self._reshape_feature( + patch_features, self.out_size, self.out_size + ) + + # STEP 4: Get Intermediate Features (features 1 and 2) + + intermediate0_features = self._reshape_feature( + self.intermediate0_hidden_states, + self.out_size, + self.out_size, + ) + intermediate1_features = self._reshape_feature( + self.intermediate1_hidden_states, + self.out_size, + self.out_size, + ) + intermediate0_features = self._merge( + intermediate0_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + intermediate1_features = self._merge( + intermediate1_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + # STEP 5: Get Patch Encoder Features (features 3-5) + + high_res_features, med_res_features, low_res_features = torch.split( + patch_features, + [len(high_res_patches), len(med_res_patches), len(low_res_patches)], + dim=0, + ) + + high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) + med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) + low_res_features = low_res_features + + # STEP 6: Get Image Encoder Features (features 6) + + image_encodings = self.image_encoder( + pixel_values=low_res_patches, + head_mask=head_mask, + 
output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + image_features = image_encodings[0] + image_features = self._reshape_feature( + image_features, self.out_size, self.out_size + ) + + # STEP 7: Upsample All Features (feature 1-6) + + # feature (1-2) + intermediate0_features = self.upsample_intermediate0( + intermediate0_features + ) + intermediate1_features = self.upsample_intermediate1( + intermediate1_features + ) + + # feature (3-5) + high_res_features = self.upsample_high_res(high_res_features) + med_res_features = self.upsample_med_res(med_res_features) + low_res_features = self.upsample_low_res(low_res_features) + + # feature (6) + image_features = self.upsample_image(image_features) + image_features = self.fuse_image_with_low_res( + torch.cat((low_res_features, image_features), dim=1) + ) + + last_hidden_state = [ + intermediate0_features, + intermediate1_features, + high_res_features, + med_res_features, + # low_res_features, + image_features, # fused with low_res_features + ] + + hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_attentions else None + attentions = patch_encodings.attentions + image_encodings.attentions if output_hidden_states else None + + if not return_dict: + return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=hidden_states, + attentions=attentions, + ) + + +class DepthProFOVModel(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size + + self.encoder = DepthProViT(config) + self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.low_res_neck = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.ReLU(True) + ) + self.head = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + ) + + def forward( + self, + pixel_values: torch.Tensor, + low_res_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=0.25, + mode="bilinear", + align_corners=False, + ) + encoder_outputs = self.encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_features = encoder_outputs[0] + + image_features = self.encoder_neck(image_features) + + # TODO: add some comments + image_features = image_features[:, 1:] + image_features = image_features.permute(0, 2, 1) + + 
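+        # project the decoder's low-resolution features to half the decoder width and
+        # halve their spatial resolution (stride-2 conv) so they can be added to the FOV encoder features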
low_res_features = self.low_res_neck(low_res_features) + + image_features = image_features.reshape_as(low_res_features) + image_features = image_features + low_res_features + fov_output = self.head(image_features) + fov_output = fov_output.reshape(1) + + if not return_dict: + head_outputs = (fov_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=fov_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro +class DepthProResidualLayer(nn.Module): + def __init__(self, config): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_decoder + self.hidden_size = config.decoder_hidden_size + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + self.hidden_size, + self.hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=(not self.use_batch_norm), + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + self.hidden_size, + self.hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=(not self.use_batch_norm), + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(self.hidden_size) + self.batch_norm2 = nn.BatchNorm2d(self.hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +class DepthProFeatureFusionLayer(nn.Module): + def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: + super().__init__() + self.config = config + self.use_deconv = use_deconv + + self.residual_layer1 = DepthProResidualLayer(config) + self.residual_layer2 = DepthProResidualLayer(config) + + if self.use_deconv: + self.deconv = nn.ConvTranspose2d( + in_channels=config.decoder_hidden_size, + out_channels=config.decoder_hidden_size, + kernel_size=2, + stride=2, + padding=0, + bias=False, + ) + + self.projection = nn.Conv2d(config.decoder_hidden_size, config.decoder_hidden_size, kernel_size=1, bias=True) + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, hidden_state, residual=None): + if residual is not None: + hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) + + hidden_state = self.residual_layer2(hidden_state) + if self.use_deconv: + hidden_state = self.deconv(hidden_state) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage +class DepthProDecoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + + self.hidden_size = config.decoder_hidden_size + self.decoder_feature_dims = [config.decoder_hidden_size] + config.patch_encoder_feature_dims + + self.projections = nn.ModuleList() + self.fusions = nn.ModuleList() + for i, dim in enumerate(self.decoder_feature_dims): + + # Projection + if i != 0: + # conv for hidden_states[1:] + projection = nn.Conv2d( + in_channels=dim, + out_channels=self.hidden_size, + kernel_size=3, 
+ stride=1, + padding=1, + bias=False, + ) + elif self.hidden_size != dim: + # first hidden_state with dim differnet from hidden_size + projection = nn.Conv2d( + in_channels=dim, + out_channels=self.hidden_size, + kernel_size=1, + bias=False, + ) + else: + # first hidden_state with dim same as hidden_size + projection = nn.Identity() + self.projections.append(projection) + + # Fusion + fusion = DepthProFeatureFusionLayer(config, use_deconv=(i!=0)) + self.fusions.append(fusion) + + def forward(self, hidden_states): + + if len(hidden_states) != len(self.decoder_feature_dims): + raise ValueError( + f"Got number of hidden_states = {len(hidden_states)}," + f"expected number of hidden_states = {len(self.decoder_feature_dims)}." + ) + + # first extract the low_res_features + last_features = hidden_states[-1] + last_features = self.projections[-1](last_features) + low_res_features = last_features # required later for fov_encoder + last_features = self.fusions[-1](last_features) + + # now get features through each layer + for i in range(len(hidden_states) - 2, -1, -1): + hidden_state = hidden_states[i] + projection = self.projections[i] + fusion = self.fusions[i] + + projected = projection(hidden_state) + last_features = fusion(last_features, projected) + + return last_features, low_res_features + + +class DepthProPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DepthProViTSwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +DEPTH_PRO_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_PRO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. 
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", + DEPTH_PRO_START_DOCSTRING, +) +class DepthProModel(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.use_fov = config.use_fov + + # dinov2 (vit) like encoder + self.encoder = DepthProEncoder(config) + # dpt (vit) like decoder + self.decoder = DepthProDecoder(config) + # dinov2 (vit) like encoder + self.fov_model = DepthProFOVModel(config) if self.use_fov else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + embeddings = { + "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, + "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, + } + if self.use_fov: + embeddings['fov_embeddings'] = self.fov_model.embeddings.patch_embeddings + return embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.fov_model.encoder.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # TODO + # @add_code_sample_docstrings( + # checkpoint=_CHECKPOINT_FOR_DOC, + # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, + # config_class=_CONFIG_FOR_DOC, + # modality="vision", + # expected_output=_EXPECTED_OUTPUT_SHAPE, + # ) + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encodings = self.encoder( + pixel_values, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + ) + + encodings_last_hidden_state = encodings.last_hidden_state + + for i in range(len(encodings_last_hidden_state)): + ic(encodings_last_hidden_state[i].shape) + + features, low_res_features = self.decoder(encodings_last_hidden_state) + + ic(features.shape) + ic(low_res_features.shape) + # ic(features); exit() + + if self.use_fov: + fov_out = self.fov_model( + pixel_values=pixel_values, + 
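+                # detach() stops gradients of the FOV head from flowing back into the decoder features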
low_res_features=low_res_features.detach(), + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + fov_out = None + + return features, fov_out + + +class DepthProDepthEstimationHead(nn.Module): + """ + # TODO + Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the paper's + supplementary material). + """ + + def __init__(self, config): + super().__init__() + self.config = config + + features = config.decoder_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), + nn.ConvTranspose2d( + in_channels=features//2, out_channels=features//2, + kernel_size=2, stride=2, padding=0, bias=True + ), + nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ) + + + def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: + predicted_depth = self.head(hidden_states) + predicted_depth = predicted_depth.squeeze(dim=1) + return predicted_depth + + +@add_start_docstrings( + """ + DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). + """, + DEPTH_PRO_START_DOCSTRING, +) +class DepthProForDepthEstimation(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.depth_pro = DepthProModel(config) + self.head = DepthProDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Returns: + + Examples: + ```python + >>> from transformers import AutoImageProcessor, DPTForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large") + >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... predicted_depth = outputs.predicted_depth + + >>> # interpolate to original size + >>> prediction = torch.nn.functional.interpolate( + ... predicted_depth.unsqueeze(1), + ... size=image.size[::-1], + ... mode="bicubic", + ... align_corners=False, + ... 
) + + >>> # visualize the prediction + >>> output = prediction.squeeze().cpu().numpy() + >>> formatted = (output * 255 / np.max(output)).astype("uint8") + >>> depth = Image.fromarray(formatted) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = [None] * 4 + + hidden_states, fov_out = self.depth_pro( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + predicted_depth = self.head(hidden_states) + ic(predicted_depth.shape) + ic(fov_out.shape) + + # ic(predicted_depth); exit() + ic(fov_out); exit() + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + # hidden_states=outputs.hidden_states, + # attentions=outputs.attentions, + ) From 1728a2ff687435bc615a8c67d9a4f55baa6ff8d4 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 9 Nov 2024 16:23:06 +0500 Subject: [PATCH 02/72] refactor model architechture --- .../depth_pro/configuration_depth_pro.py | 19 +- .../models/depth_pro/modeling_depth_pro.py | 478 ++++++++++-------- 2 files changed, 288 insertions(+), 209 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index ad0f1016f7a147..7e66e679c67ff1 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -129,9 +129,18 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, + patch_encoder_feature_dims = [256, 512, 1024, 1024], + patch_encoder_hook_ids = [5, 11], # patch_encoder_hook_ids = [5, 11, 17, 23], - patch_encoder_feature_dims = [256, 512, 1024, 1024], + intermediate_feature_dims = [256, 256], + intermediate_upsample_layers = [3, 2], + high_res_feature_dims = 512, + med_res_feature_dims = 1024, + low_res_feature_dims = 1024, + image_feature_dims = 1024, + global_feature_dims = 1024, + use_batch_norm_in_decoder=False, use_fov=False, **kwargs, @@ -165,3 +174,11 @@ def __init__( self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov = use_fov + + self.intermediate_feature_dims = intermediate_feature_dims + self.intermediate_upsample_layers = intermediate_upsample_layers + self.high_res_feature_dims = high_res_feature_dims + self.med_res_feature_dims = med_res_feature_dims + self.low_res_feature_dims = low_res_feature_dims + self.image_feature_dims = image_feature_dims + self.global_feature_dims = global_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f73b74060f5778..74669bc4e55753 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -568,105 +568,112 @@ def forward( ) +class 
DepthProUpsampleBlock(nn.Module): + def __init__( + self, + input_dims, + intermediate_dims, + output_dims, + n_upsample_layers, + use_proj=True, + bias=False, + ) -> None: + super().__init__() + + # create first projection block + if use_proj: + self.proj = nn.Conv2d( + in_channels=input_dims, + out_channels=intermediate_dims, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + else: + self.proj = nn.Identity() + + # create following upsample blocks + self.upsample_blocks = nn.Sequential() + for i in range(n_upsample_layers): + in_channels = intermediate_dims if i == 0 else output_dims + layer = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=bias, + ) + self.upsample_blocks.append(layer) + + def forward(self, features): + projected = self.proj(features) + return self.upsample_blocks(projected) + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size + self.patch_encoder_hook_ids = config.patch_encoder_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.intermediate_upsample_layers = config.intermediate_upsample_layers + self.out_size = 24 # TODO: image_size // patch_size # patch encoder self.patch_encoder = DepthProViT(config) - self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[0]].register_forward_hook( - self._intermediate0_hook - ) - self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[1]].register_forward_hook( - self._intermediate1_hook - ) # image encoder self.image_encoder = DepthProViT(config) - # upsampling features (1-2) - self.upsample_intermediate0 = self._create_project_upsample_block( - input_dims=config.hidden_size, - intermediate_dims=config.patch_encoder_feature_dims[0], - output_dims=config.decoder_hidden_size, - n_upsample_layers=3, - ) - self.upsample_intermediate1 = self._create_project_upsample_block( - input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[0], - n_upsample_layers=2, - ) + # upsampling intermediate features - (1-2) in diagram + self.upsample_intermediate = nn.ModuleList() + for i, (feature_dims, upsample_layers) in enumerate(zip( + self.intermediate_feature_dims, + self.intermediate_upsample_layers, + )): + intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims + upsample_block = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=intermediate_dims, + output_dims=feature_dims, + n_upsample_layers=upsample_layers, + ) + self.upsample_intermediate.append(upsample_block) - # upsampling features (3-5) - self.upsample_high_res = self._create_project_upsample_block( + # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram + self.upsample_high_res = DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[1], + intermediate_dims=config.high_res_feature_dims, + output_dims=config.high_res_feature_dims, n_upsample_layers=1, ) - self.upsample_med_res = self._create_project_upsample_block( + self.upsample_med_res = DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[2], + intermediate_dims=config.med_res_feature_dims, + output_dims=config.med_res_feature_dims, n_upsample_layers=1, ) - self.upsample_low_res = self._create_project_upsample_block( + self.upsample_low_res = 
DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[3], + intermediate_dims=config.low_res_feature_dims, + output_dims=config.low_res_feature_dims, n_upsample_layers=1, ) - # upsampling features (6) - self.upsample_image = nn.ConvTranspose2d( - in_channels=config.hidden_size, - out_channels=config.patch_encoder_feature_dims[3], - kernel_size=2, - stride=2, - padding=0, - bias=True, - ) - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=(config.patch_encoder_feature_dims[3] + config.patch_encoder_feature_dims[3]), - out_channels=config.patch_encoder_feature_dims[3], - kernel_size=1, - stride=1, - padding=0, + # upsampling image features - (6) in diagram + self.upsample_image = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=config.hidden_size, + output_dims=config.image_feature_dims, + n_upsample_layers=1, + use_proj=False, bias=True, ) - def _intermediate0_hook(self, model, input, output): - self.intermediate0_hidden_states = output[0] - - def _intermediate1_hook(self, model, input, output): - self.intermediate1_hidden_states = output[0] - - def _create_project_upsample_block( - self, - input_dims: int, - output_dims: int, - n_upsample_layers: int, - intermediate_dims: Optional[int] = None, - ) -> nn.Module: - - intermediate_dims = intermediate_dims or output_dims - - # Projection block followed by upsampling blocks. - blocks = [ - nn.Conv2d(input_dims, intermediate_dims, kernel_size=1, stride=1, padding=0, bias=False) - ] + [ - nn.ConvTranspose2d( - in_channels=(intermediate_dims if i == 0 else output_dims), - out_channels=output_dims, - kernel_size=2, - stride=2, - padding=0, - bias=False - ) for i in range(n_upsample_layers) - ] - - return nn.Sequential(*blocks) - def _interpolate(self, pixel_values, scale_factor): return nn.functional.interpolate( pixel_values, @@ -771,97 +778,100 @@ def forward( dim=0, ) - # STEP 3: apply patch encoder + # STEP 3: apply patch and image encoder patch_encodings = self.patch_encoder( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_hidden_states=True, # required for intermediate features return_dict=True, ) - patch_features = patch_encodings[0] - patch_features = self._reshape_feature( - patch_features, self.out_size, self.out_size + image_encodings = self.image_encoder( + pixel_values=low_res_patches, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, ) - # STEP 4: Get Intermediate Features (features 1 and 2) - - intermediate0_features = self._reshape_feature( - self.intermediate0_hidden_states, - self.out_size, - self.out_size, - ) - intermediate1_features = self._reshape_feature( - self.intermediate1_hidden_states, - self.out_size, - self.out_size, - ) - intermediate0_features = self._merge( - intermediate0_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 - ) - intermediate1_features = self._merge( - intermediate1_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 - ) + # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram - # STEP 5: Get Patch Encoder Features (features 3-5) + # a. extract hidden_state + hidden_state = patch_encodings.last_hidden_state + # b. 
reshape back to image like + features = self._reshape_feature( + hidden_state, self.out_size, self.out_size + ) high_res_features, med_res_features, low_res_features = torch.split( - patch_features, + features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, ) + # c. merge patches back together high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) - low_res_features = low_res_features + low_res_features = low_res_features # no merge required with low res image - # STEP 6: Get Image Encoder Features (features 6) + # d. upsample + high_res_features = self.upsample_high_res(high_res_features) + med_res_features = self.upsample_med_res(med_res_features) + low_res_features = self.upsample_low_res(low_res_features) - image_encodings = self.image_encoder( - pixel_values=low_res_patches, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - image_features = image_encodings[0] - image_features = self._reshape_feature( - image_features, self.out_size, self.out_size - ) + # STEP 5: get intermediate features - (1-2) in diagram - # STEP 7: Upsample All Features (feature 1-6) + intermediate_features = [] + for layer_id in self.patch_encoder_hook_ids: + + # a. extract hidden_state + hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well - # feature (1-2) - intermediate0_features = self.upsample_intermediate0( - intermediate0_features - ) - intermediate1_features = self.upsample_intermediate1( - intermediate1_features + # b. reshape back to image like + features = self._reshape_feature( + hidden_state, + self.out_size, + self.out_size, + ) + + # c. merge patches back together + features = self._merge( + features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + # d. upsample + features = self.upsample_intermediate[layer_id](features) + + intermediate_features.append(features) + + # STEP 6: get image features - (6) in diagram + + # a. extract hidden_state + hidden_state = image_encodings.last_hidden_state + + # b. reshape back to image like + image_features = self._reshape_feature( + hidden_state, self.out_size, self.out_size ) - # feature (3-5) - high_res_features = self.upsample_high_res(high_res_features) - med_res_features = self.upsample_med_res(med_res_features) - low_res_features = self.upsample_low_res(low_res_features) + # c. merge patches back together + # skipped, no merge required with low res image - # feature (6) + # d. 
upsample image_features = self.upsample_image(image_features) - image_features = self.fuse_image_with_low_res( - torch.cat((low_res_features, image_features), dim=1) - ) + # STEP 7: return these features last_hidden_state = [ - intermediate0_features, - intermediate1_features, + *intermediate_features, high_res_features, med_res_features, - # low_res_features, - image_features, # fused with low_res_features + low_res_features, + image_features, ] - hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_attentions else None - attentions = patch_encodings.attentions + image_encodings.attentions if output_hidden_states else None + hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None if not return_dict: return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) @@ -882,7 +892,7 @@ def __init__(self, config: DepthProConfig) -> None: self.encoder = DepthProViT(config) self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) - self.low_res_neck = nn.Sequential( + self.global_neck = nn.Sequential( nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) @@ -897,7 +907,7 @@ def __init__(self, config: DepthProConfig) -> None: def forward( self, pixel_values: torch.Tensor, - low_res_features: torch.Tensor, + global_features: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, @@ -923,19 +933,19 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - image_features = encoder_outputs[0] + last_hidden_state = encoder_outputs[0] - image_features = self.encoder_neck(image_features) + last_hidden_state = self.encoder_neck(last_hidden_state) # TODO: add some comments - image_features = image_features[:, 1:] - image_features = image_features.permute(0, 2, 1) + last_hidden_state = last_hidden_state[:, 1:] + last_hidden_state = last_hidden_state.permute(0, 2, 1) - low_res_features = self.low_res_neck(low_res_features) + global_features = self.global_neck(global_features) - image_features = image_features.reshape_as(low_res_features) - image_features = image_features + low_res_features - fov_output = self.head(image_features) + last_hidden_state = last_hidden_state.reshape_as(global_features) + last_hidden_state = last_hidden_state + global_features + fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(1) if not return_dict: @@ -1040,65 +1050,126 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - self.hidden_size = config.decoder_hidden_size - self.decoder_feature_dims = [config.decoder_hidden_size] + config.patch_encoder_feature_dims - - self.projections = nn.ModuleList() - self.fusions = nn.ModuleList() - for i, dim in enumerate(self.decoder_feature_dims): - - # Projection - if i != 0: - # conv for hidden_states[1:] - projection = nn.Conv2d( - in_channels=dim, - out_channels=self.hidden_size, + # for STEP 2: fuse low_res and image features + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.low_res_feature_dims+config.image_feature_dims, + out_channels=config.global_feature_dims, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + # for STEP 3: apply decoder block for global features + self.global_proj = 
nn.Conv2d( + in_channels=config.global_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.global_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 4: apply decoder block for med features + self.med_res_proj = nn.Conv2d( + in_channels=config.med_res_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.med_res_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 5: apply decoder block for high features + self.high_res_proj = nn.Conv2d( + in_channels=config.high_res_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.high_res_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 6: apply decoder block for intermediate features + self.intermediate_proj = nn.Sequential() + self.intermediate_fusion = nn.Sequential() + for i, feature_dim in enumerate(config.intermediate_feature_dims): + if i == 0: + # no projection for final intermediate layer + proj = nn.Identity() + fusion = DepthProFeatureFusionLayer(config, use_deconv=False) + else: + proj = nn.Conv2d( + in_channels=feature_dim, + out_channels=config.decoder_hidden_size, kernel_size=3, stride=1, padding=1, bias=False, ) - elif self.hidden_size != dim: - # first hidden_state with dim differnet from hidden_size - projection = nn.Conv2d( - in_channels=dim, - out_channels=self.hidden_size, - kernel_size=1, - bias=False, - ) - else: - # first hidden_state with dim same as hidden_size - projection = nn.Identity() - self.projections.append(projection) + fusion = DepthProFeatureFusionLayer(config) - # Fusion - fusion = DepthProFeatureFusionLayer(config, use_deconv=(i!=0)) - self.fusions.append(fusion) + self.intermediate_proj.append(proj) + self.intermediate_fusion.append(fusion) def forward(self, hidden_states): - if len(hidden_states) != len(self.decoder_feature_dims): - raise ValueError( - f"Got number of hidden_states = {len(hidden_states)}," - f"expected number of hidden_states = {len(self.decoder_feature_dims)}." 
- ) + # STEP 1: extract features - # first extract the low_res_features - last_features = hidden_states[-1] - last_features = self.projections[-1](last_features) - low_res_features = last_features # required later for fov_encoder - last_features = self.fusions[-1](last_features) + intermediate_features = hidden_states[:-4] + # intermediate_features_i.shape: [batch_size, config.intermediate_feature_dims_i, 768, 768], [1, 256, 384, 384] + high_res_features = hidden_states[-4] + # high_res_features.shape: [batch_size, config.high_res_feature_dims, 192, 192] + med_res_features = hidden_states[-3] + # med_res_features.shape: [batch_size, config.med_res_feature_dims, 96, 96] + low_res_features = hidden_states[-2] + # low_res_features.shape: [batch_size, config.low_res_feature_dims, 48, 48] + image_features = hidden_states[-1] + # image_features.shape: [batch_size, config.image_feature_dims, 48, 48] - # now get features through each layer - for i in range(len(hidden_states) - 2, -1, -1): - hidden_state = hidden_states[i] - projection = self.projections[i] - fusion = self.fusions[i] + # STEP 2: fuse low_res and image features - projected = projection(hidden_state) - last_features = fusion(last_features, projected) + global_features = torch.cat((low_res_features, image_features), dim=1) + global_features = self.fuse_image_with_low_res(global_features) + # global_features.shape: [batch_size, config.global_feature_dims, 48, 48] - return last_features, low_res_features + # STEP 3: apply decoder block for global features + + # apply projection: used by fusion now and then fov later + global_projected = self.global_proj(global_features) + # apply fusion: used by next projections and fusions + last_features = self.global_fusion(global_projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 96, 96] + + # STEP 4: apply decoder block for med features + + projected = self.med_res_proj(med_res_features) + last_features = self.med_res_fusion(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 192, 192] + + # STEP 5: apply decoder block for high features + + projected = self.high_res_proj(high_res_features) + last_features = self.high_res_fusion(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 384, 384] + + # STEP 6: apply decoder block for intermediate features + + for (features, proj_layer, fusion_layer) in zip( + # reversed becuase decoding is applied from last features to first features + intermediate_features[::-1], + self.intermediate_proj[::-1], + self.intermediate_fusion[::-1], + ): + projected = proj_layer(features) + last_features = fusion_layer(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] + # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] + + return last_features, global_projected class DepthProPreTrainedModel(PreTrainedModel): @@ -1233,26 +1304,18 @@ def forward( encodings = self.encoder( pixel_values, head_mask, - output_attentions, - output_hidden_states, - return_dict, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) - encodings_last_hidden_state = encodings.last_hidden_state - - for i in range(len(encodings_last_hidden_state)): - ic(encodings_last_hidden_state[i].shape) - - features, low_res_features = self.decoder(encodings_last_hidden_state) - - ic(features.shape) - ic(low_res_features.shape) - # ic(features); exit() + last_hidden_state = 
encodings[0] + last_hidden_state, global_features = self.decoder(last_hidden_state) if self.use_fov: fov_out = self.fov_model( pixel_values=pixel_values, - low_res_features=low_res_features.detach(), + global_features=global_features.detach(), head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -1261,7 +1324,8 @@ def forward( else: fov_out = None - return features, fov_out + # TODO: return all hidden_states + return last_hidden_state, fov_out class DepthProDepthEstimationHead(nn.Module): @@ -1375,18 +1439,16 @@ def forward( outputs = [None] * 4 - hidden_states, fov_out = self.depth_pro( + last_hidden_state, fov_out = self.depth_pro( pixel_values=pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - predicted_depth = self.head(hidden_states) - ic(predicted_depth.shape) - ic(fov_out.shape) + predicted_depth = self.head(last_hidden_state) - # ic(predicted_depth); exit() + ic(predicted_depth) ic(fov_out); exit() if not return_dict: From 11ce50c5cf2c87839909da806b1a9dc1665c11f2 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 12 Nov 2024 10:49:46 +0500 Subject: [PATCH 03/72] update model outputs --- .../models/depth_pro/modeling_depth_pro.py | 77 ++++++++++++++----- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 74669bc4e55753..daa2bbbdd64ba8 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -24,9 +24,10 @@ from torch import nn from dataclasses import dataclass +from ...utils import ModelOutput from ...activations import ACT2FN from ...modeling_outputs import ( - BaseModelOutput, + BaseModelOutput, DepthEstimatorOutput ) from ...utils import ( add_code_sample_docstrings, @@ -1232,6 +1233,18 @@ def _init_weights(self, module): """ +@dataclass +class DepthProModelOutput(BaseModelOutput): + """ + Base class for model's outputs, with potential fov, hidden states and attentions. + + Args: + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + Field of View Scaler. 
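The `fov` field added here follows the usual `transformers.utils.ModelOutput` pattern: fields left as `None` are dropped from the dict and tuple views, so the extra output only shows up when the FOV branch actually ran. A minimal standalone sketch of that behaviour; `ToyOutput` is invented for illustration and is not part of this patch:

```python
import torch
from dataclasses import dataclass
from typing import Optional

from transformers.utils import ModelOutput


@dataclass
class ToyOutput(ModelOutput):
    last_hidden_state: torch.FloatTensor = None
    fov: Optional[torch.FloatTensor] = None


with_fov = ToyOutput(last_hidden_state=torch.zeros(1, 4), fov=torch.tensor([0.5]))
without_fov = ToyOutput(last_hidden_state=torch.zeros(1, 4))

print(with_fov.fov)                 # tensor([0.5000])
print(len(with_fov.to_tuple()))     # 2 -> (last_hidden_state, fov)
print(len(without_fov.to_tuple()))  # 1 -> the None field is dropped
```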
+ """ + fov: Optional[torch.FloatTensor] = None + + @add_start_docstrings( "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", DEPTH_PRO_START_DOCSTRING, @@ -1306,14 +1319,14 @@ def forward( head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) - last_hidden_state = encodings[0] + last_hidden_state = encodings.last_hidden_state last_hidden_state, global_features = self.decoder(last_hidden_state) if self.use_fov: - fov_out = self.fov_model( + fov_encodings = self.fov_model( pixel_values=pixel_values, global_features=global_features.detach(), head_mask=head_mask, @@ -1321,11 +1334,24 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) + fov = fov_encodings.last_hidden_state else: - fov_out = None + fov = None + + attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + + if not return_dict: + outputs = (last_hidden_state, fov, hidden_states, attentions) + outputs = (i for i in outputs if i is not None) + return outputs - # TODO: return all hidden_states - return last_hidden_state, fov_out + return DepthProModelOutput( + last_hidden_state=last_hidden_state, + fov=fov, + hidden_states=hidden_states, + attentions=attentions, + ) class DepthProDepthEstimationHead(nn.Module): @@ -1360,6 +1386,18 @@ def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: return predicted_depth +@dataclass +class DepthProDepthEstimatorOutput(DepthEstimatorOutput): + """ + Base class for outputs of DepthProDepthEstimator. + + Args: + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + Field of View Scaler. + """ + fov: Optional[torch.FloatTensor] = None + + @add_start_docstrings( """ DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). 
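Because `DepthProDepthEstimationHead` is a plain `nn.Sequential`, its shape behaviour can be checked in isolation: the transposed convolution doubles the spatial resolution and the final 1x1 convolution collapses the channels to a single depth plane. A standalone sketch, assuming `decoder_hidden_size=256` and a deliberately small feature map to keep it cheap:

```python
import torch
from torch import nn

features = 256  # stand-in for config.decoder_hidden_size
head = nn.Sequential(
    nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
    nn.ConvTranspose2d(features // 2, features // 2, kernel_size=2, stride=2, padding=0, bias=True),
    nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
    nn.ReLU(True),
    nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
    nn.ReLU(),
)

decoder_features = torch.randn(1, features, 96, 96)   # (B, decoder_hidden_size, H, W)
predicted_depth = head(decoder_features).squeeze(dim=1)
print(predicted_depth.shape)                          # torch.Size([1, 192, 192])
```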
@@ -1436,31 +1474,28 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + # use_fov = use_fov if use_fov is not None else self.config.use_fov - outputs = [None] * 4 - - last_hidden_state, fov_out = self.depth_pro( + depth_pro_outputs = self.depth_pro( pixel_values=pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) + last_hidden_state = depth_pro_outputs[0] predicted_depth = self.head(last_hidden_state) - ic(predicted_depth) - ic(fov_out); exit() - if not return_dict: - if output_hidden_states: - output = (predicted_depth,) + outputs[1:] + if loss is None: + return (predicted_depth,) + depth_pro_outputs[1:] else: - output = (predicted_depth,) + outputs[2:] - return ((loss,) + output) if loss is not None else output + return (loss, predicted_depth) + depth_pro_outputs[1:] - return DepthEstimatorOutput( + return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - # hidden_states=outputs.hidden_states, - # attentions=outputs.attentions, + fov=depth_pro_outputs.fov, + hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, ) From 27e9593ada48c5c17a3a96e67bff534e022359ad Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:23:03 +0500 Subject: [PATCH 04/72] update init param to include use_fov_model --- .../models/depth_pro/modeling_depth_pro.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index daa2bbbdd64ba8..f8b69bfec86eb6 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1239,7 +1239,7 @@ class DepthProModelOutput(BaseModelOutput): Base class for model's outputs, with potential fov, hidden states and attentions. Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. 
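Together with the config rename in the next commit, the new `use_fov_model` argument lets a caller override the config default per instance. A hedged usage sketch, assuming the modules added by this series are importable from their new locations and keeping in mind that instantiation builds several full-size encoders with random weights:

```python
from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig
from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation

config = DepthProConfig(use_fov_model=False)                    # config-level default
model = DepthProForDepthEstimation(config, use_fov_model=True)  # per-instance override

print(model.use_fov_model)                    # True
print(model.depth_pro.fov_model is not None)  # True: the FOV branch was built despite the config default
```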
""" fov: Optional[torch.FloatTensor] = None @@ -1250,17 +1250,17 @@ class DepthProModelOutput(BaseModelOutput): DEPTH_PRO_START_DOCSTRING, ) class DepthProModel(DepthProPreTrainedModel): - def __init__(self, config): + def __init__(self, config, use_fov_model=None): super().__init__(config) self.config = config - self.use_fov = config.use_fov + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model # dinov2 (vit) like encoder self.encoder = DepthProEncoder(config) # dpt (vit) like decoder self.decoder = DepthProDecoder(config) # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov else None + self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None # Initialize weights and apply final processing self.post_init() @@ -1325,7 +1325,7 @@ def forward( last_hidden_state = encodings.last_hidden_state last_hidden_state, global_features = self.decoder(last_hidden_state) - if self.use_fov: + if self.use_fov_model: fov_encodings = self.fov_model( pixel_values=pixel_values, global_features=global_features.detach(), @@ -1392,7 +1392,7 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): Base class for outputs of DepthProDepthEstimator. Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. """ fov: Optional[torch.FloatTensor] = None @@ -1405,10 +1405,11 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): DEPTH_PRO_START_DOCSTRING, ) class DepthProForDepthEstimation(DepthProPreTrainedModel): - def __init__(self, config): + def __init__(self, config, use_fov_model=None): super().__init__(config) + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - self.depth_pro = DepthProModel(config) + self.depth_pro = DepthProModel(config, use_fov_model=self.use_fov_model) self.head = DepthProDepthEstimationHead(config) # Initialize weights and apply final processing @@ -1474,7 +1475,6 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - # use_fov = use_fov if use_fov is not None else self.config.use_fov depth_pro_outputs = self.depth_pro( pixel_values=pixel_values, From e74a7f505f91a24117e7838e367b72a50ff9e8f1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:24:21 +0500 Subject: [PATCH 05/72] update param name in config --- src/transformers/models/depth_pro/configuration_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 7e66e679c67ff1..a4037c99ee0fc0 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -142,7 +142,7 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, - use_fov=False, + use_fov_model=False, **kwargs, ): super().__init__(**kwargs) @@ -173,7 +173,7 @@ def __init__( self.patch_encoder_hook_ids = patch_encoder_hook_ids self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder - self.use_fov = use_fov + self.use_fov_model = use_fov_model self.intermediate_feature_dims = 
intermediate_feature_dims self.intermediate_upsample_layers = intermediate_upsample_layers From 8c2460b0655dd3ef698b765eb64c79cc785c7d10 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:51:56 +0500 Subject: [PATCH 06/72] fix hidden_states and attentions outputs for fov --- src/transformers/models/depth_pro/modeling_depth_pro.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f8b69bfec86eb6..620133771c0674 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1332,14 +1332,15 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) fov = fov_encodings.last_hidden_state + attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None else: fov = None - - attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + attentions = encodings.attentions + hidden_states = encodings.hidden_states if not return_dict: outputs = (last_hidden_state, fov, hidden_states, attentions) From 55f6ed3439cef2a731b8b78cba3b6142e3125447 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 11:20:56 +0500 Subject: [PATCH 07/72] sort config --- .../models/depth_pro/configuration_depth_pro.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index a4037c99ee0fc0..16ff55e9cb6c94 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -129,10 +129,7 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, - patch_encoder_feature_dims = [256, 512, 1024, 1024], - patch_encoder_hook_ids = [5, 11], - # patch_encoder_hook_ids = [5, 11, 17, 23], intermediate_feature_dims = [256, 256], intermediate_upsample_layers = [3, 2], high_res_feature_dims = 512, @@ -140,7 +137,6 @@ def __init__( low_res_feature_dims = 1024, image_feature_dims = 1024, global_feature_dims = 1024, - use_batch_norm_in_decoder=False, use_fov_model=False, **kwargs, @@ -171,10 +167,8 @@ def __init__( self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states self.patch_encoder_hook_ids = patch_encoder_hook_ids - self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model - self.intermediate_feature_dims = intermediate_feature_dims self.intermediate_upsample_layers = intermediate_upsample_layers self.high_res_feature_dims = high_res_feature_dims From b25dffb5d7f0aef86bb7c2dac990c24b28dafb5a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 11:21:13 +0500 Subject: [PATCH 08/72] complete minor todos --- .../models/depth_pro/modeling_depth_pro.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 
620133771c0674..956fe7afb7f7b9 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -938,8 +938,7 @@ def forward( last_hidden_state = self.encoder_neck(last_hidden_state) - # TODO: add some comments - last_hidden_state = last_hidden_state[:, 1:] + last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token last_hidden_state = last_hidden_state.permute(0, 2, 1) global_features = self.global_neck(global_features) @@ -1357,10 +1356,10 @@ def forward( class DepthProDepthEstimationHead(nn.Module): """ - # TODO - Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples - the predictions to the input resolution after the first convolutional layer (details can be found in the paper's - supplementary material). + The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. + This module comprises a sequence of convolutional and transposed convolutional layers + that process the feature map from the decoder to produce a single-channel depth map. + Key operations include dimensionality reduction and upsampling to match the input resolution. """ def __init__(self, config): From c225deb0d126a8420ccb5e381fa2e120abedabf0 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 13:22:15 +0500 Subject: [PATCH 09/72] update patching --- .../models/depth_pro/modeling_depth_pro.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 956fe7afb7f7b9..59b6d46e30cae2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -685,23 +685,25 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - patch_size = 384 # TODO: this should be infered - patch_stride = int(patch_size * (1 - overlap_ratio)) + B, C, H, W = pixel_values.shape - image_size = pixel_values.shape[-1] - steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1 + patch_size = 384 # TODO: this should be inferred + stride = int(patch_size * (1 - overlap_ratio)) - x_patch_list = [] - for j in range(steps): - j0 = j * patch_stride - j1 = j0 + patch_size + if pixel_values.dim() != 4: + raise ValueError("Input tensor must have shape (B, C, H, W).") - for i in range(steps): - i0 = i * patch_stride - i1 = i0 + patch_size - x_patch_list.append(pixel_values[..., j0:j1, i0:i1]) + # pixel_values.shape (B, C, H, W) + patches = torch.nn.functional.unfold( + pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) + ) + # patches.shape (B, -1, num_patches) + patches = patches.permute(2, 0, 1) + # patches.shape (num_patches, B, -1) + patches = patches.reshape(-1, C, patch_size, patch_size) + # patches.shape (B * num_patches, C, patch_size, patch_size) - return torch.cat(x_patch_list, dim=0) + return patches def _reshape_feature( self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 @@ -760,7 +762,7 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - batch_size = pixel_values.shape[0] + B, C, H, W = pixel_values.shape # STEP 1: create 3-level image @@ -812,8 +814,8 @@ def forward( ) # c. 
merge patches back together - high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) - med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) + high_res_features = self._merge(high_res_features, batch_size=B, padding=3) + med_res_features = self._merge(med_res_features, batch_size=B, padding=6) low_res_features = low_res_features # no merge required with low res image # d. upsample @@ -838,7 +840,7 @@ def forward( # c. merge patches back together features = self._merge( - features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + features[: B * 5 * 5], batch_size=B, padding=3 ) # d. upsample From 176932dc6aba7bfaf541bee756fc493f541434dd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 16:35:43 +0500 Subject: [PATCH 10/72] update config for encoder --- .../depth_pro/configuration_depth_pro.py | 14 ++- .../models/depth_pro/modeling_depth_pro.py | 108 ++++++++++-------- 2 files changed, 71 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 16ff55e9cb6c94..cdf3cf4d8d7077 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -119,7 +119,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-6, image_size=384, - patch_size=16, # changed + patch_size=16, # TODO remove this num_channels=3, qkv_bias=True, layerscale_value=1.0, @@ -139,6 +139,13 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, use_fov_model=False, + + # aux_image_size=1536, + # aux_patch_size=384, + aux_image_size=1536 // 2, + aux_patch_size=384 // 2, + aux_num_channels=3, + patch_embeddings_size=16, **kwargs, ): super().__init__(**kwargs) @@ -176,3 +183,8 @@ def __init__( self.low_res_feature_dims = low_res_feature_dims self.image_feature_dims = image_feature_dims self.global_feature_dims = global_feature_dims + + self.aux_image_size = aux_image_size + self.aux_patch_size = aux_patch_size + self.aux_num_channels = aux_num_channels + self.patch_embeddings_size = patch_embeddings_size diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 59b6d46e30cae2..3d3d356cc0eeb2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -55,22 +55,22 @@ class DepthProViTPatchEmbeddings(nn.Module): def __init__(self, config): super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.hidden_size - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + self.config = config + self.in_channels = config.aux_num_channels + self.out_channels = config.hidden_size + self.patch_embeddings_size = config.patch_embeddings_size + + self.projection = nn.Conv2d( + self.in_channels, + self.out_channels, + 
kernel_size=(self.patch_embeddings_size, self.patch_embeddings_size), + stride=(self.patch_embeddings_size, self.patch_embeddings_size), + ) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.num_channels: + if num_channels != self.config.aux_num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." @@ -89,10 +89,12 @@ class DepthProViTEmbeddings(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() + self.config = config + self.seq_len = (config.aux_patch_size // config.patch_embeddings_size) ** 2 + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) - num_patches = self.patch_embeddings.num_patches - self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.patch_size = config.patch_size self.config = config @@ -107,11 +109,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_patches = embeddings.shape[1] - 1 num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes - if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: return self.position_embeddings class_pos_embed = self.position_embeddings[:, :1] @@ -119,8 +120,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: dim = embeddings.shape[-1] - new_height = height // self.patch_size - new_width = width // self.patch_size + new_height = height // self.patch_size # TODO: check this + new_width = width // self.patch_size # TODO: check this sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) @@ -621,8 +622,9 @@ def __init__(self, config: DepthProConfig) -> None: self.patch_encoder_hook_ids = config.patch_encoder_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims self.intermediate_upsample_layers = config.intermediate_upsample_layers - - self.out_size = 24 # TODO: image_size // patch_size + + self.out_size = config.aux_patch_size // config.patch_embeddings_size + self.seq_len = self.out_size ** 2 # patch encoder self.patch_encoder = DepthProViT(config) @@ -685,23 +687,18 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - B, C, H, W = pixel_values.shape - - patch_size = 384 # TODO: this should be inferred + patch_size = self.config.aux_patch_size stride = int(patch_size * (1 - overlap_ratio)) - if pixel_values.dim() != 4: - raise ValueError("Input tensor must have shape (B, C, H, W).") - - # pixel_values.shape (B, C, H, W) + # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) patches = torch.nn.functional.unfold( pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) ) # patches.shape (B, -1, num_patches) 
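With the full-resolution defaults (`image_size=1536`, `patch_size=384`, restored a few commits later) and a 0.25 overlap, this unfold yields the 5 x 5 grid of overlapping patches that the later `B * 5 * 5` slice relies on; the medium level uses a 0.5 overlap for a 3 x 3 grid, and the low-resolution image is taken as a single patch. A standalone sanity check of the high-resolution case:

```python
import torch

image_size, patch_size, overlap_ratio = 1536, 384, 0.25
stride = int(patch_size * (1 - overlap_ratio))  # 288

pixel_values = torch.randn(1, 3, image_size, image_size)
patches = torch.nn.functional.unfold(
    pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride)
)                                               # (1, 3*384*384, 25): (1536 - 384) // 288 + 1 = 5 per axis
patches = patches.permute(2, 0, 1).reshape(-1, 3, patch_size, patch_size)
print(patches.shape)                            # torch.Size([25, 3, 384, 384])
```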
patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, C, patch_size, patch_size) - # patches.shape (B * num_patches, C, patch_size, patch_size) + patches = patches.reshape(-1, self.config.aux_num_channels, patch_size, patch_size) + # patches.shape (B * num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) return patches @@ -762,24 +759,33 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if pixel_values.dim() != 4: + raise ValueError("Input tensor must have shape (B, C, H, W).") + B, C, H, W = pixel_values.shape + # TODO validate: H = W = aux_image_size + # TODO validate: C = aux_num_channels + # TODO validate: aux_image_size = aux_patch_size * 4 + + # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # STEP 1: create 3-level image - high_res = pixel_values - med_res = self._interpolate(pixel_values, 0.5) - low_res = self._interpolate(pixel_values, 0.25) + high_res = pixel_values # (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + med_res = self._interpolate(pixel_values, 0.5) # (B, config.aux_num_channels, config.aux_image_size//2, config.aux_image_size//2) + low_res = self._interpolate(pixel_values, 0.25) # (B, config.aux_num_channels, config.aux_image_size//4, config.aux_image_size//4) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) - med_res_patches = self._patch(med_res, 0.5) - low_res_patches = low_res + high_res_patches = self._patch(high_res, 0.25) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + med_res_patches = self._patch(med_res, 0.5) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + low_res_patches = low_res # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) patches = torch.cat( (high_res_patches, med_res_patches, low_res_patches), dim=0, - ) + ) # (num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) # STEP 3: apply patch and image encoder @@ -801,42 +807,43 @@ def forward( # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram # a. extract hidden_state - hidden_state = patch_encodings.last_hidden_state + hidden_state = patch_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) + ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) high_res_features, med_res_features, low_res_features = torch.split( features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, - ) + ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) # c. merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, padding=3) - med_res_features = self._merge(med_res_features, batch_size=B, padding=6) - low_res_features = low_res_features # no merge required with low res image + high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) + med_res_features = self._merge(med_res_features, batch_size=B, padding=6) # (B, config.hidden_size, ~, ~) + low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, ~, ~) # d. 
upsample - high_res_features = self.upsample_high_res(high_res_features) - med_res_features = self.upsample_med_res(med_res_features) - low_res_features = self.upsample_low_res(low_res_features) + high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, ~, ~) + med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, ~, ~) + low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, ~, ~) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] for layer_id in self.patch_encoder_hook_ids: - + # a. extract hidden_state hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well + # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size, - ) + ) # (num_patches, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together features = self._merge( @@ -845,24 +852,25 @@ def forward( # d. upsample features = self.upsample_intermediate[layer_id](features) + # (B, config.intermediate_feature_dims[layer_id], ~, ~) intermediate_features.append(features) # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = image_encodings.last_hidden_state + hidden_state = image_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) + ) # (num_patches, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together # skipped, no merge required with low res image # d. 
upsample - image_features = self.upsample_image(image_features) + image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, ~, ~) # STEP 7: return these features last_hidden_state = [ From dcec5228b21352f6638c27c91f1d4056323eba95 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 16:46:17 +0500 Subject: [PATCH 11/72] fix config --- .../depth_pro/configuration_depth_pro.py | 20 +++----- .../models/depth_pro/modeling_depth_pro.py | 48 +++++++++---------- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index cdf3cf4d8d7077..fc12b37b19d073 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -118,9 +118,12 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-6, - image_size=384, - patch_size=16, # TODO remove this + # image_size=1536, + # patch_size=384, + image_size=1536 // 2, + patch_size=384 // 2, num_channels=3, + patch_embeddings_size=16, qkv_bias=True, layerscale_value=1.0, drop_path_rate=0.0, @@ -139,13 +142,6 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, use_fov_model=False, - - # aux_image_size=1536, - # aux_patch_size=384, - aux_image_size=1536 // 2, - aux_patch_size=384 // 2, - aux_num_channels=3, - patch_embeddings_size=16, **kwargs, ): super().__init__(**kwargs) @@ -163,6 +159,7 @@ def __init__( self.image_size = image_size self.patch_size = patch_size self.num_channels = num_channels + self.patch_embeddings_size = patch_embeddings_size self.qkv_bias = qkv_bias self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate @@ -183,8 +180,3 @@ def __init__( self.low_res_feature_dims = low_res_feature_dims self.image_feature_dims = image_feature_dims self.global_feature_dims = global_feature_dims - - self.aux_image_size = aux_image_size - self.aux_patch_size = aux_patch_size - self.aux_num_channels = aux_num_channels - self.patch_embeddings_size = patch_embeddings_size diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3d3d356cc0eeb2..d5639131397923 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -57,7 +57,7 @@ def __init__(self, config): super().__init__() self.config = config - self.in_channels = config.aux_num_channels + self.in_channels = config.num_channels self.out_channels = config.hidden_size self.patch_embeddings_size = config.patch_embeddings_size @@ -70,7 +70,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.config.aux_num_channels: + if num_channels != self.config.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." 
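With patch size and patch-embedding size both coming from the config, the previously hard-coded `out_size = 24` falls out as `patch_size // patch_embeddings_size`. A standalone sketch of the token arithmetic using the config defaults; the conv projection mirrors `DepthProViTPatchEmbeddings`:

```python
import torch
from torch import nn

patch_size, patch_embeddings_size, hidden_size = 384, 16, 1024  # config defaults
projection = nn.Conv2d(3, hidden_size, kernel_size=patch_embeddings_size, stride=patch_embeddings_size)

patch = torch.randn(1, 3, patch_size, patch_size)
tokens = projection(patch).flatten(2).transpose(1, 2)

out_size = patch_size // patch_embeddings_size
print(out_size, out_size**2)  # 24 576 -> seq_len per patch (plus one cls token)
print(tokens.shape)           # torch.Size([1, 576, 1024])
```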
@@ -90,14 +90,12 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - self.seq_len = (config.aux_patch_size // config.patch_embeddings_size) ** 2 + self.seq_len = (config.patch_size // config.patch_embeddings_size) ** 2 self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.patch_size = config.patch_size - self.config = config def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ @@ -120,8 +118,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: dim = embeddings.shape[-1] - new_height = height // self.patch_size # TODO: check this - new_width = width // self.patch_size # TODO: check this + new_height = height // self.config.patch_embeddings_size + new_width = width // self.config.patch_embeddings_size sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) @@ -623,7 +621,7 @@ def __init__(self, config: DepthProConfig) -> None: self.intermediate_feature_dims = config.intermediate_feature_dims self.intermediate_upsample_layers = config.intermediate_upsample_layers - self.out_size = config.aux_patch_size // config.patch_embeddings_size + self.out_size = config.patch_size // config.patch_embeddings_size self.seq_len = self.out_size ** 2 # patch encoder @@ -687,18 +685,18 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - patch_size = self.config.aux_patch_size + patch_size = self.config.patch_size stride = int(patch_size * (1 - overlap_ratio)) - # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) patches = torch.nn.functional.unfold( pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) ) # patches.shape (B, -1, num_patches) patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, self.config.aux_num_channels, patch_size, patch_size) - # patches.shape (B * num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + patches = patches.reshape(-1, self.config.num_channels, patch_size, patch_size) + # patches.shape (B * num_patches, config.num_channels, config.patch_size, config.patch_size) return patches @@ -764,28 +762,28 @@ def forward( B, C, H, W = pixel_values.shape - # TODO validate: H = W = aux_image_size - # TODO validate: C = aux_num_channels - # TODO validate: aux_image_size = aux_patch_size * 4 + # TODO validate: H = W = image_size + # TODO validate: C = num_channels + # TODO validate: image_size = patch_size * 4 - # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) # STEP 1: create 3-level image - high_res = pixel_values # (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) - med_res = self._interpolate(pixel_values, 0.5) # (B, config.aux_num_channels, config.aux_image_size//2, config.aux_image_size//2) - low_res = self._interpolate(pixel_values, 0.25) # (B, config.aux_num_channels, config.aux_image_size//4, 
config.aux_image_size//4) + high_res = pixel_values # (B, config.num_channels, config.image_size, config.image_size) + med_res = self._interpolate(pixel_values, 0.5) # (B, config.num_channels, config.image_size//2, config.image_size//2) + low_res = self._interpolate(pixel_values, 0.25) # (B, config.num_channels, config.image_size//4, config.image_size//4) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) - med_res_patches = self._patch(med_res, 0.5) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) - low_res_patches = low_res # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + high_res_patches = self._patch(high_res, 0.25) # (-1, config.num_channels, config.patch_size, config.patch_size) + med_res_patches = self._patch(med_res, 0.5) # (-1, config.num_channels, config.patch_size, config.patch_size) + low_res_patches = low_res # (-1, config.num_channels, config.patch_size, config.patch_size) patches = torch.cat( (high_res_patches, med_res_patches, low_res_patches), dim=0, - ) # (num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + ) # (num_patches, config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -812,12 +810,12 @@ def forward( # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) + ) # (num_patches, config.num_channels, self.out_size, self.out_size) high_res_features, med_res_features, low_res_features = torch.split( features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, - ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) + ) # (num_patches, config.num_channels, self.out_size, self.out_size) # c. 
merge patches back together high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) From 0384d2f189062259b3b99a3d692593e28902ec0b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 19:37:00 +0500 Subject: [PATCH 12/72] use correct defaults in config --- .../models/depth_pro/configuration_depth_pro.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index fc12b37b19d073..aff3eb3e2941ac 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -118,10 +118,8 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-6, - # image_size=1536, - # patch_size=384, - image_size=1536 // 2, - patch_size=384 // 2, + image_size=1536, + patch_size=384, num_channels=3, patch_embeddings_size=16, qkv_bias=True, From 85e4f868b65fa5b208883cb973824ca6e2557db8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 17 Nov 2024 23:47:50 +0500 Subject: [PATCH 13/72] update merge for compatibility with different image size --- .../depth_pro/configuration_depth_pro.py | 6 +- .../models/depth_pro/modeling_depth_pro.py | 135 +++++++++++------- 2 files changed, 88 insertions(+), 53 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index aff3eb3e2941ac..d9f973639ad0fd 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -108,9 +108,9 @@ class DepthProConfig(PretrainedConfig): def __init__( self, - hidden_size=1024, # changed + hidden_size=1024, decoder_hidden_size=256, - num_hidden_layers=24, # changed + num_hidden_layers=24, num_attention_heads=16, mlp_ratio=4, hidden_act="gelu", @@ -132,7 +132,6 @@ def __init__( reshape_hidden_states=True, patch_encoder_hook_ids = [5, 11], intermediate_feature_dims = [256, 256], - intermediate_upsample_layers = [3, 2], high_res_feature_dims = 512, med_res_feature_dims = 1024, low_res_feature_dims = 1024, @@ -172,7 +171,6 @@ def __init__( self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model self.intermediate_feature_dims = intermediate_feature_dims - self.intermediate_upsample_layers = intermediate_upsample_layers self.high_res_feature_dims = high_res_feature_dims self.med_res_feature_dims = med_res_feature_dims self.low_res_feature_dims = low_res_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index d5639131397923..316afe444fbb62 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -619,7 +619,6 @@ def __init__(self, config: DepthProConfig) -> None: self.decoder_hidden_size = config.decoder_hidden_size self.patch_encoder_hook_ids = config.patch_encoder_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims - self.intermediate_upsample_layers = config.intermediate_upsample_layers self.out_size = config.patch_size // config.patch_embeddings_size self.seq_len = self.out_size ** 2 @@ -632,17 +631,15 @@ def __init__(self, config: DepthProConfig) -> None: # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = 
nn.ModuleList() - for i, (feature_dims, upsample_layers) in enumerate(zip( - self.intermediate_feature_dims, - self.intermediate_upsample_layers, - )): + for i, feature_dims in enumerate(self.intermediate_feature_dims): intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims upsample_block = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=upsample_layers, + n_upsample_layers=1+len(self.intermediate_feature_dims)-i, ) + self.upsample_intermediate.append(upsample_block) # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram @@ -714,34 +711,46 @@ def _reshape_feature( hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) return hidden_states - def _merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor: + def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: """Merge the patched input into a image with sliding window.""" - steps = int(math.sqrt(x.shape[0] // batch_size)) - - idx = 0 - - output_list = [] - for j in range(steps): - output_row_list = [] - for i in range(steps): - output = x[batch_size * idx : batch_size * (idx + 1)] + # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) + box_size = int(math.sqrt(x.shape[0] // batch_size)) - if j != 0: - output = output[..., padding:, :] - if i != 0: - output = output[..., :, padding:] - if j != steps - 1: - output = output[..., :-padding, :] - if i != steps - 1: - output = output[..., :, :-padding] - - output_row_list.append(output) - idx += 1 - - output_row = torch.cat(output_row_list, dim=-1) - output_list.append(output_row) - output = torch.cat(output_list, dim=-2) - return output + """ + merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) + padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) + """ + padding = ( box_size * self.out_size - merge_out_size ) // ( 2 * box_size - 2 ) + + i = 0 + boxes = [] + for h in range(box_size): + boxes_in_row = [] + for w in range(box_size): + box = x[batch_size * i : batch_size * (i + 1)] + + if h != 0: + # remove pad from height if box is not at top border + box = box[..., padding:, :] + if w != 0: + # remove pad from width if box is not at left border + box = box[..., :, padding:] + if h != box_size - 1: + # remove pad from height if box is not at bottom border + box = box[..., :box.shape[-2]-padding, :] + if w != box_size - 1: + # remove pad from width if box is not at right border + box = box[..., :, :box.shape[-1]-padding] + + boxes_in_row.append(box) + i += 1 + + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + + boxes = torch.cat(boxes, dim=-2) + boxes = boxes[..., :merge_out_size, :merge_out_size] + return boxes def forward( self, @@ -818,19 +827,19 @@ def forward( ) # (num_patches, config.num_channels, self.out_size, self.out_size) # c. 
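(Aside on the padding arithmetic in `_merge`: solving the first line of the docstring comment for `padding` gives `(box_size * out_size - merge_out_size) / (2 * box_size - 2)`, which is what the code computes; the second comment line appears to carry a sign/denominator slip. A standalone check with illustrative numbers:)

```python
# Standalone check of the seam-trimming arithmetic used by `_merge`.
def merge_padding(box_size: int, out_size: int, merge_out_size: int) -> int:
    # From merge_out_size = (box_size - 2) * (out_size - 2 * padding)
    #                       + 2 * (out_size - padding)
    # it follows that padding = (box_size * out_size - merge_out_size) / (2 * box_size - 2).
    return (box_size * out_size - merge_out_size) // (2 * box_size - 2)

def merged_side(box_size: int, out_size: int, padding: int) -> int:
    # Interior patches are trimmed on both sides, border patches only on the inner side.
    return (box_size - 2) * (out_size - 2 * padding) + 2 * (out_size - padding)

# Illustrative numbers: a 5x5 grid of 24x24 patch features merged back to 96x96.
pad = merge_padding(box_size=5, out_size=24, merge_out_size=96)  # 3
assert merged_side(5, 24, pad) == 96
```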
merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) - med_res_features = self._merge(med_res_features, batch_size=B, padding=6) # (B, config.hidden_size, ~, ~) - low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, ~, ~) + high_res_features = self._merge(high_res_features, batch_size=B, merge_out_size=self.out_size*4) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) + med_res_features = self._merge(med_res_features, batch_size=B, merge_out_size=self.out_size*2) # (B, config.hidden_size, self.out_size*2**1, self.out_size*2**1) + low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, self.out_size*2**0, self.out_size*2**0) # d. upsample - high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, ~, ~) - med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, ~, ~) - low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, ~, ~) + high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, self.out_size*2**3, self.out_size*2**3) + med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, self.out_size*2**2, self.out_size*2**2) + low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, self.out_size*2**1, self.out_size*2**1) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] - for layer_id in self.patch_encoder_hook_ids: + for i, layer_id in enumerate(self.patch_encoder_hook_ids): # a. extract hidden_state hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well @@ -845,12 +854,12 @@ def forward( # c. merge patches back together features = self._merge( - features[: B * 5 * 5], batch_size=B, padding=3 - ) + features[: B * 5 * 5], batch_size=B, merge_out_size=self.out_size*4, + ) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) # d. upsample features = self.upsample_intermediate[layer_id](features) - # (B, config.intermediate_feature_dims[layer_id], ~, ~) + # (B, config.intermediate_feature_dims[i], self.out_size*2**(3+total-i), self.out_size*2**(3+total-i)) intermediate_features.append(features) @@ -868,16 +877,25 @@ def forward( # skipped, no merge required with low res image # d. 
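(Aside: the merge/upsample bookkeeping above can be checked by hand. Assuming the defaults used in this series, patch_size=384 and patch_embeddings_size=16 give out_size=24, and the sketch below reproduces the per-feature side lengths recorded in the shape comments a little further down.)

```python
# Side lengths produced by the merge/upsample steps above, for image_size=1536.
patch_size, patch_embeddings_size = 384, 16
out_size = patch_size // patch_embeddings_size               # 24

high_res = out_size * 4 * 2                                  # merged to 96, one upsample -> 192
med_res = out_size * 2 * 2                                   # merged to 48, one upsample -> 96
low_res = out_size * 2                                       # 24, one upsample -> 48
image = out_size * 2                                         # 24, one upsample -> 48

num_hooks = 2                                                # len(intermediate_feature_dims)
intermediate = [out_size * 4 * 2 ** (1 + num_hooks - i) for i in range(num_hooks)]

print(intermediate + [high_res, med_res, low_res, image])    # [768, 384, 192, 96, 48, 48]
```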
upsample - image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, ~, ~) + image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) # STEP 7: return these features last_hidden_state = [ - *intermediate_features, - high_res_features, - med_res_features, - low_res_features, - image_features, + *intermediate_features, # (B, config.image_feature_dims, self.out_size*2**3+total-i, self.out_size*2**3+total-i) + high_res_features, # (B, config.image_feature_dims, self.out_size*2**3, self.out_size*2**3) + med_res_features, # (B, config.image_feature_dims, self.out_size*2**2, self.out_size*2**2) + low_res_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) + image_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) ] + # for i in last_hidden_state: + # ic(i.shape) + # exit() + + # 768, 384, 192, 96, 48, 48 - image_size=1536 + # 384, 192, 96, 48, 24, 24 - image_size=768 (ideal) + # 288, 144, 72, 24, 24, 24 - image_size=768 (practical) + # 1536, 768, 384, 192, 96, 96 - image_size=3072 (ideal) + # 1728, 864, 432, 240, 96, 96 - image_size=3072 (practical) hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None @@ -951,6 +969,11 @@ def forward( global_features = self.global_neck(global_features) + ic(last_hidden_state.shape) + ic(global_features.shape) + + # exit() + last_hidden_state = last_hidden_state.reshape_as(global_features) last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) @@ -1107,7 +1130,15 @@ def __init__(self, config: DepthProConfig) -> None: for i, feature_dim in enumerate(config.intermediate_feature_dims): if i == 0: # no projection for final intermediate layer - proj = nn.Identity() + if feature_dim == config.decoder_hidden_size: + proj = nn.Identity() + else: + proj = nn.Conv2d( + in_channels=feature_dim, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) fusion = DepthProFeatureFusionLayer(config, use_deconv=False) else: proj = nn.Conv2d( @@ -1124,6 +1155,10 @@ def __init__(self, config: DepthProConfig) -> None: self.intermediate_fusion.append(fusion) def forward(self, hidden_states): + ic("Start of Decoder") + + for i in hidden_states: + ic(i.shape) # STEP 1: extract features @@ -1492,7 +1527,9 @@ def forward( return_dict=True, ) last_hidden_state = depth_pro_outputs[0] + ic(last_hidden_state.shape) predicted_depth = self.head(last_hidden_state) + ic(predicted_depth.shape) if not return_dict: if loss is None: From 00e4aa3b7bb04324cd08f2f87a2a34f4033fccca Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 11:04:58 +0500 Subject: [PATCH 14/72] restructure encoder for custom configuration --- .../depth_pro/configuration_depth_pro.py | 21 +- .../models/depth_pro/modeling_depth_pro.py | 842 ++++++++---------- 2 files changed, 395 insertions(+), 468 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d9f973639ad0fd..0558309004171f 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -59,6 +59,7 @@ class DepthProConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon 
used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): + TODO: image_size / 2**n_decoder_blocks = patch_size / patch_embeddings_size The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. @@ -130,13 +131,11 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, - patch_encoder_hook_ids = [5, 11], + intermediate_hook_ids = [11, 5], intermediate_feature_dims = [256, 256], - high_res_feature_dims = 512, - med_res_feature_dims = 1024, - low_res_feature_dims = 1024, - image_feature_dims = 1024, - global_feature_dims = 1024, + scaled_images_ratios = [0.25, 0.5, 1], + scaled_images_overlap_ratios = [0.0, 0.5, 0.25], + scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_decoder=False, use_fov_model=False, **kwargs, @@ -167,12 +166,10 @@ def __init__( ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.patch_encoder_hook_ids = patch_encoder_hook_ids self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model + self.intermediate_hook_ids = intermediate_hook_ids self.intermediate_feature_dims = intermediate_feature_dims - self.high_res_feature_dims = high_res_feature_dims - self.med_res_feature_dims = med_res_feature_dims - self.low_res_feature_dims = low_res_feature_dims - self.image_feature_dims = image_feature_dims - self.global_feature_dims = global_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 316afe444fbb62..9f146177402c00 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -226,7 +226,6 @@ def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
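(Aside: after this restructuring the multi-scale behaviour is driven entirely by the new list-valued config fields. A minimal sketch of constructing such a config, assuming the branch from this patch series is installed; the values simply repeat the new defaults.)

```python
# Minimal sketch: the new list-valued fields decide how many scaled copies of the image
# are encoded and which patch-encoder layers are hooked for intermediate features.
from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig

config = DepthProConfig(
    image_size=1536,
    patch_size=384,
    patch_embeddings_size=16,
    scaled_images_ratios=[0.25, 0.5, 1],            # low to high resolution
    scaled_images_overlap_ratios=[0.0, 0.5, 0.25],  # sliding-window overlap per scale
    scaled_images_feature_dims=[1024, 1024, 512],
    intermediate_hook_ids=[11, 5],                  # patch-encoder layers to tap
    intermediate_feature_dims=[256, 256],
)
print(config.patch_size // config.patch_embeddings_size)  # 24 features per patch side
```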
@@ -617,11 +616,40 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.decoder_hidden_size = config.decoder_hidden_size - self.patch_encoder_hook_ids = config.patch_encoder_hook_ids + + self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.n_scaled_images = len(self.scaled_images_ratios) + self.n_intermediate_hooks = len(self.intermediate_hook_ids) self.out_size = config.patch_size // config.patch_embeddings_size - self.seq_len = self.out_size ** 2 + self.seq_len = self.out_size ** 2 # each patch is flattened + + # config.scaled_images_ratios is sorted + if config.scaled_images_ratios != sorted(config.scaled_images_ratios): + raise ValueError( + f"Values in scaled_images_ratios={config.scaled_images_ratios} " + "should be sorted from low to high" + ) + + # lowest image resolution is greator than the patch_size + if config.scaled_images_ratios[0] * config.image_size < config.patch_size: + raise ValueError( + "Image cannot be scaled to a size less than patch_size. " + f"Provide values in scaled_images_ratios={config.scaled_images_ratios} suitable " + f"to the given patch_size={config.patch_size}." + ) + + # patch_size should be a divisible by patch_embeddings_size + # else it raises an exception in DepthProViTPatchEmbeddings + if config.patch_size % config.patch_embeddings_size != 0: + raise ValueError( + f"patch_size={config.patch_size} should be divisible " + f"by patch_embeddings_size={config.patch_embeddings_size}." 
+ ) # patch encoder self.patch_encoder = DepthProViT(config) @@ -629,6 +657,17 @@ def __init__(self, config: DepthProConfig) -> None: # image encoder self.image_encoder = DepthProViT(config) + # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram + self.upsample_scaled_images = nn.ModuleList() + for i, feature_dims in enumerate(self.scaled_images_feature_dims): + upsample_block = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=feature_dims, + output_dims=feature_dims, + n_upsample_layers=1, + ) + self.upsample_scaled_images.append(upsample_block) + # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() for i, feature_dims in enumerate(self.intermediate_feature_dims): @@ -637,42 +676,33 @@ def __init__(self, config: DepthProConfig) -> None: input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=1+len(self.intermediate_feature_dims)-i, + n_upsample_layers=2+i, ) - self.upsample_intermediate.append(upsample_block) - # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram - self.upsample_high_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.high_res_feature_dims, - output_dims=config.high_res_feature_dims, - n_upsample_layers=1, - ) - self.upsample_med_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.med_res_feature_dims, - output_dims=config.med_res_feature_dims, - n_upsample_layers=1, - ) - self.upsample_low_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.low_res_feature_dims, - output_dims=config.low_res_feature_dims, - n_upsample_layers=1, - ) - # upsampling image features - (6) in diagram self.upsample_image = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=config.hidden_size, - output_dims=config.image_feature_dims, + output_dims=config.scaled_images_feature_dims[0], n_upsample_layers=1, use_proj=False, bias=True, ) + # for STEP 7: fuse low_res and image features + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.scaled_images_feature_dims[0]*2, + out_channels=config.scaled_images_feature_dims[0], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + def _interpolate(self, pixel_values, scale_factor): + if scale_factor == 1: + return pixel_values return nn.functional.interpolate( pixel_values, size=None, @@ -682,6 +712,10 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): + if pixel_values.shape[-1] == self.config.patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + patch_size = self.config.patch_size stride = int(patch_size * (1 - overlap_ratio)) @@ -712,7 +746,11 @@ def _reshape_feature( return hidden_states def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: - """Merge the patched input into a image with sliding window.""" + if batch_size == x.shape[0]: + # merge only if the patches were created from this scaled image + # pathces are not created when scaled image size is equal to patch size + return x + # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) box_size = int(math.sqrt(x.shape[0] // batch_size)) @@ -771,28 +809,35 @@ def forward( B, C, H, W = pixel_values.shape - # TODO validate: H = W = image_size - # TODO validate: C = num_channels - # TODO validate: image_size = 
patch_size * 4 + if not (H == W == self.config.image_size): + raise ValueError( + f"Height={H} and Width={W} doesnot match the specified image_size={self.config.image_size} in config." + ) + + if not (C == self.config.num_channels): + raise ValueError( + f"Found {C} channels in image, expected number of channels is {self.config.num_channels} from config." + ) # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) # STEP 1: create 3-level image - high_res = pixel_values # (B, config.num_channels, config.image_size, config.image_size) - med_res = self._interpolate(pixel_values, 0.5) # (B, config.num_channels, config.image_size//2, config.image_size//2) - low_res = self._interpolate(pixel_values, 0.25) # (B, config.num_channels, config.image_size//4, config.image_size//4) + scaled_images = [] + for ratio in self.scaled_images_ratios: + scaled_images.append(self._interpolate(pixel_values, ratio)) + # (B, config.num_channels, config.image_size * ratio, config.image_size * ratio) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) # (-1, config.num_channels, config.patch_size, config.patch_size) - med_res_patches = self._patch(med_res, 0.5) # (-1, config.num_channels, config.patch_size, config.patch_size) - low_res_patches = low_res # (-1, config.num_channels, config.patch_size, config.patch_size) - - patches = torch.cat( - (high_res_patches, med_res_patches, low_res_patches), - dim=0, - ) # (num_patches, config.num_channels, config.patch_size, config.patch_size) + for i in range(self.n_scaled_images): + scaled_images[i] = self._patch( + scaled_images[i], + overlap_ratio=self.scaled_images_overlap_ratios[i], + ) + scaled_images_num_patches = [len(i) for i in scaled_images] + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + # (sum(scaled_images_num_patches), config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -803,8 +848,13 @@ def forward( output_hidden_states=True, # required for intermediate features return_dict=True, ) + scaled_images_last_hidden_state = torch.split_with_sizes( + patch_encodings.last_hidden_state, + scaled_images_num_patches[::-1] + )[::-1] # -1 as patch encoder expects high res patches first + image_encodings = self.image_encoder( - pixel_values=low_res_patches, + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -813,89 +863,87 @@ def forward( # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram - # a. extract hidden_state - hidden_state = patch_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) + scaled_images_features = [] + for i in range(self.n_scaled_images): + # a. extract hidden_state + hidden_state = scaled_images_last_hidden_state[i] + # (scaled_images_num_patches[i], self.seq_len+1, config.hidden_size) - # b. reshape back to image like - features = self._reshape_feature( - hidden_state, self.out_size, self.out_size - ) # (num_patches, config.num_channels, self.out_size, self.out_size) - high_res_features, med_res_features, low_res_features = torch.split( - features, - [len(high_res_patches), len(med_res_patches), len(low_res_patches)], - dim=0, - ) # (num_patches, config.num_channels, self.out_size, self.out_size) + # b. 
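(Aside on step 3: patches from all scales are concatenated from high to low resolution before a single patch-encoder pass, so the output has to be split with the reversed patch counts and flipped back. A small illustration, using the patch counts implied by the default ratios and overlaps for a batch of one image:)

```python
import torch

# With the defaults (image_size=1536, patch_size=384, ratios [0.25, 0.5, 1],
# overlaps [0.0, 0.5, 0.25]) a single image yields 1, 9 and 25 patches per scale.
scaled_images_num_patches = [1, 9, 25]               # low, medium, high resolution
seq_len, hidden_size = 24 * 24, 1024
encodings = torch.randn(sum(scaled_images_num_patches), seq_len + 1, hidden_size)

# [::-1] twice: the encoder consumed high-res patches first, so split accordingly
# and then flip the resulting chunks back to low-to-high order.
per_scale = torch.split_with_sizes(encodings, scaled_images_num_patches[::-1])[::-1]
for expected, chunk in zip(scaled_images_num_patches, per_scale):
    assert chunk.shape[0] == expected
```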
reshape back to image like + features = self._reshape_feature( + hidden_state, self.out_size, self.out_size + ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) - # c. merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, merge_out_size=self.out_size*4) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) - med_res_features = self._merge(med_res_features, batch_size=B, merge_out_size=self.out_size*2) # (B, config.hidden_size, self.out_size*2**1, self.out_size*2**1) - low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, self.out_size*2**0, self.out_size*2**0) + # c. merge patches back together + features = self._merge( + features, batch_size=B, merge_out_size=self.out_size*2**i + ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) - # d. upsample - high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, self.out_size*2**3, self.out_size*2**3) - med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, self.out_size*2**2, self.out_size*2**2) - low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, self.out_size*2**1, self.out_size*2**1) + # d. upsample + features = self.upsample_scaled_images[i](features) + # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) + + scaled_images_features.append(features) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] - for i, layer_id in enumerate(self.patch_encoder_hook_ids): + for i in range(self.n_intermediate_hooks): # a. extract hidden_state - hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well - # (num_patches, self.seq_len+1, config.hidden_size) + layer_id = self.intermediate_hook_ids[i] + 1 # +1 to correct index position as hidden_states contain embedding output as well + hidden_state = patch_encodings.hidden_states[layer_id] + hidden_state = hidden_state[:scaled_images_num_patches[-1]] # num_patches to be of same length as highest resolution + # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size, - ) # (num_patches, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together features = self._merge( - features[: B * 5 * 5], batch_size=B, merge_out_size=self.out_size*4, - ) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) + features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - features = self.upsample_intermediate[layer_id](features) - # (B, config.intermediate_feature_dims[i], self.out_size*2**(3+total-i), self.out_size*2**(3+total-i)) + features = self.upsample_intermediate[i](features) + # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) intermediate_features.append(features) # STEP 6: get image features - (6) in diagram # a. 
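(Aside on the `+ 1` in step 5: with `output_hidden_states=True` the returned tuple starts with the embedding output, so encoder block `k` sits at index `k + 1`. A toy illustration:)

```python
# Why intermediate_hook_ids are offset by one: index 0 of `hidden_states` is the
# embedding output, indices 1..num_hidden_layers are the encoder blocks.
num_hidden_layers = 24
hidden_states = ["embeddings"] + [f"block_{i}" for i in range(num_hidden_layers)]

intermediate_hook_ids = [11, 5]
for hook_id in intermediate_hook_ids:
    assert hidden_states[hook_id + 1] == f"block_{hook_id}"
```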
extract hidden_state - hidden_state = image_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) + hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) # (num_patches, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - # skipped, no merge required with low res image + image_features = self._merge( + image_features, batch_size=B, merge_out_size=self.out_size*2**(0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) - - # STEP 7: return these features - last_hidden_state = [ - *intermediate_features, # (B, config.image_feature_dims, self.out_size*2**3+total-i, self.out_size*2**3+total-i) - high_res_features, # (B, config.image_feature_dims, self.out_size*2**3, self.out_size*2**3) - med_res_features, # (B, config.image_feature_dims, self.out_size*2**2, self.out_size*2**2) - low_res_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) - image_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) + image_features = self.upsample_image(image_features) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) + + # STEP 7: apply fusion (global_features = image_features + scaled_images_features[0]) + # fuses image_features with lowest resolution features as they are of same size + scaled_images_features[0] = torch.cat((scaled_images_features[0], image_features), dim=1) + scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) + + # STEP 8: return these features in order of increasing size as what decoder expects + last_hidden_state = [ + # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) + *scaled_images_features, + # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) + *intermediate_features, ] - # for i in last_hidden_state: - # ic(i.shape) - # exit() - - # 768, 384, 192, 96, 48, 48 - image_size=1536 - # 384, 192, 96, 48, 24, 24 - image_size=768 (ideal) - # 288, 144, 72, 24, 24, 24 - image_size=768 (practical) - # 1536, 768, 384, 192, 96, 96 - image_size=3072 (ideal) - # 1728, 864, 432, 240, 96, 96 - image_size=3072 (practical) hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None @@ -910,84 +958,133 @@ def forward( ) -class DepthProFOVModel(nn.Module): - def __init__(self, config: DepthProConfig) -> None: - super().__init__() +class DepthProPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
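(Aside on step 7: the image-encoder features and the lowest-resolution scaled features share the same spatial size, so they are concatenated on the channel axis and mixed by a 1x1 convolution. A shape-only sketch with the default dimensions:)

```python
import torch
from torch import nn

# Step-7 fusion in isolation: channel concat followed by a 1x1 convolution.
# 1024 mirrors scaled_images_feature_dims[0]; 48 is out_size * 2 after upsampling.
feature_dims, side = 1024, 48
fuse_image_with_low_res = nn.Conv2d(feature_dims * 2, feature_dims, kernel_size=1)

low_res_features = torch.randn(1, feature_dims, side, side)
image_features = torch.randn(1, feature_dims, side, side)
fused = fuse_image_with_low_res(torch.cat((low_res_features, image_features), dim=1))
print(fused.shape)  # torch.Size([1, 1024, 48, 48])
```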
+ """ + + config_class = DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DepthProViTSwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +DEPTH_PRO_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_PRO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", + DEPTH_PRO_START_DOCSTRING, +) +class DepthProModel(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) self.config = config - self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.encoder = DepthProEncoder(config) + # Initialize weights and apply final processing + self.post_init() - self.encoder = DepthProViT(config) - self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) - self.global_neck = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True) - ) - self.head = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), - ) + def get_input_embeddings(self): + embeddings = { + "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, + "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, + } + return embeddings + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # TODO + # @add_code_sample_docstrings( + # checkpoint=_CHECKPOINT_FOR_DOC, + # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, + # config_class=_CONFIG_FOR_DOC, + # modality="vision", + # expected_output=_EXPECTED_OUTPUT_SHAPE, + # ) def forward( self, - pixel_values: torch.Tensor, - global_features: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pixel_values = nn.functional.interpolate( - pixel_values, - size=None, - scale_factor=0.25, - mode="bilinear", - align_corners=False, - ) - encoder_outputs = self.encoder( + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encodings = self.encoder( pixel_values, - 
head_mask=head_mask, + head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - - last_hidden_state = self.encoder_neck(last_hidden_state) - - last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token - last_hidden_state = last_hidden_state.permute(0, 2, 1) - - global_features = self.global_neck(global_features) - - ic(last_hidden_state.shape) - ic(global_features.shape) - # exit() - - last_hidden_state = last_hidden_state.reshape_as(global_features) - last_hidden_state = last_hidden_state + global_features - fov_output = self.head(last_hidden_state) - fov_output = fov_output.reshape(1) - - if not return_dict: - head_outputs = (fov_output,) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutput( - last_hidden_state=fov_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + return encodings # Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro @@ -1075,325 +1172,109 @@ def forward(self, hidden_state, residual=None): return hidden_state -# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage -class DepthProDecoder(nn.Module): - def __init__(self, config: DepthProConfig) -> None: +# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro with extra layer parameters +class DepthProFeatureFusionStage(nn.Module): + def __init__(self, config, num_layers): super().__init__() - self.config = config - - # for STEP 2: fuse low_res and image features - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=config.low_res_feature_dims+config.image_feature_dims, - out_channels=config.global_feature_dims, - kernel_size=1, - stride=1, - padding=0, - bias=True, - ) - - # for STEP 3: apply decoder block for global features - self.global_proj = nn.Conv2d( - in_channels=config.global_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.global_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 4: apply decoder block for med features - self.med_res_proj = nn.Conv2d( - in_channels=config.med_res_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.med_res_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 5: apply decoder block for high features - self.high_res_proj = nn.Conv2d( - in_channels=config.high_res_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.high_res_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 6: apply decoder block for intermediate features - self.intermediate_proj = nn.Sequential() - self.intermediate_fusion = nn.Sequential() - for i, feature_dim in enumerate(config.intermediate_feature_dims): - if i == 0: - # no projection for final intermediate layer - if feature_dim == config.decoder_hidden_size: - proj = nn.Identity() - else: - proj = nn.Conv2d( - in_channels=feature_dim, - out_channels=config.decoder_hidden_size, - kernel_size=1, - bias=False, - ) - fusion = DepthProFeatureFusionLayer(config, use_deconv=False) - else: - proj = nn.Conv2d( - in_channels=feature_dim, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - fusion = DepthProFeatureFusionLayer(config) - - self.intermediate_proj.append(proj) - 
self.intermediate_fusion.append(fusion) + self.num_layers = num_layers + self.layers = nn.ModuleList() + for _ in range(self.num_layers-1): + self.layers.append(DepthProFeatureFusionLayer(config)) + # final layer doesnot require deconvolution + self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) def forward(self, hidden_states): - ic("Start of Decoder") - - for i in hidden_states: - ic(i.shape) - - # STEP 1: extract features - - intermediate_features = hidden_states[:-4] - # intermediate_features_i.shape: [batch_size, config.intermediate_feature_dims_i, 768, 768], [1, 256, 384, 384] - high_res_features = hidden_states[-4] - # high_res_features.shape: [batch_size, config.high_res_feature_dims, 192, 192] - med_res_features = hidden_states[-3] - # med_res_features.shape: [batch_size, config.med_res_feature_dims, 96, 96] - low_res_features = hidden_states[-2] - # low_res_features.shape: [batch_size, config.low_res_feature_dims, 48, 48] - image_features = hidden_states[-1] - # image_features.shape: [batch_size, config.image_feature_dims, 48, 48] - - # STEP 2: fuse low_res and image features - - global_features = torch.cat((low_res_features, image_features), dim=1) - global_features = self.fuse_image_with_low_res(global_features) - # global_features.shape: [batch_size, config.global_feature_dims, 48, 48] - - # STEP 3: apply decoder block for global features - - # apply projection: used by fusion now and then fov later - global_projected = self.global_proj(global_features) - # apply fusion: used by next projections and fusions - last_features = self.global_fusion(global_projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 96, 96] - - # STEP 4: apply decoder block for med features - - projected = self.med_res_proj(med_res_features) - last_features = self.med_res_fusion(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 192, 192] - - # STEP 5: apply decoder block for high features - - projected = self.high_res_proj(high_res_features) - last_features = self.high_res_fusion(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 384, 384] - - # STEP 6: apply decoder block for intermediate features - - for (features, proj_layer, fusion_layer) in zip( - # reversed becuase decoding is applied from last features to first features - intermediate_features[::-1], - self.intermediate_proj[::-1], - self.intermediate_fusion[::-1], - ): - projected = proj_layer(features) - last_features = fusion_layer(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] - # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] - - return last_features, global_projected - - -class DepthProPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = DepthProConfig - base_model_prefix = "depth_pro" - main_input_name = "pixel_values" - supports_gradient_checkpointing = True - _no_split_modules = ["DepthProViTSwiGLUFFN"] - _supports_sdpa = True - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -DEPTH_PRO_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. - - Parameters: - config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -DEPTH_PRO_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] - for details. - - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. -""" - + if self.num_layers != len(hidden_states): + raise ValueError( + f"num_layers={self.num_layers} in DepthProFeatureFusionStage" + f"doesnot match len(hidden_states)={len(hidden_states)}" + ) -@dataclass -class DepthProModelOutput(BaseModelOutput): - """ - Base class for model's outputs, with potential fov, hidden states and attentions. + # first layer only uses the last hidden_state + fused_hidden_state = self.layers[0](hidden_states[0]) + # looping from the second layer to last layer + for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): + fused_hidden_state = layer(fused_hidden_state, hidden_state) - Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): - Field of View Scaler. 
- """ - fov: Optional[torch.FloatTensor] = None + return fused_hidden_state -@add_start_docstrings( - "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", - DEPTH_PRO_START_DOCSTRING, -) -class DepthProModel(DepthProPreTrainedModel): - def __init__(self, config, use_fov_model=None): - super().__init__(config) +class DepthProFOVModel(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() self.config = config - self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - - # dinov2 (vit) like encoder - self.encoder = DepthProEncoder(config) - # dpt (vit) like decoder - self.decoder = DepthProDecoder(config) - # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - embeddings = { - "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, - "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, - } - if self.use_fov: - embeddings['fov_embeddings'] = self.fov_model.embeddings.patch_embeddings - return embeddings + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.fov_model.encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder = DepthProViT(config) + self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.global_neck = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.ReLU(True) + ) + self.head = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + ) - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # TODO - # @add_code_sample_docstrings( - # checkpoint=_CHECKPOINT_FOR_DOC, - # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, - # config_class=_CONFIG_FOR_DOC, - # modality="vision", - # expected_output=_EXPECTED_OUTPUT_SHAPE, - # ) def forward( self, - pixel_values: torch.FloatTensor, - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + pixel_values: torch.Tensor, + global_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - encodings = self.encoder( + pixel_values = nn.functional.interpolate( pixel_values, - head_mask, + size=None, + scale_factor=0.25, + mode="bilinear", + align_corners=False, + ) + encoder_outputs = self.encoder( + pixel_values, + head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, + return_dict=return_dict, ) + last_hidden_state = encoder_outputs[0] - last_hidden_state = encodings.last_hidden_state - last_hidden_state, global_features = self.decoder(last_hidden_state) + last_hidden_state = self.encoder_neck(last_hidden_state) - if self.use_fov_model: - fov_encodings = self.fov_model( - pixel_values=pixel_values, - global_features=global_features.detach(), - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - fov = fov_encodings.last_hidden_state - attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None - else: - fov = None - attentions = encodings.attentions - hidden_states = encodings.hidden_states + last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token + last_hidden_state = last_hidden_state.permute(0, 2, 1) + + global_features = self.global_neck(global_features) + + ic(last_hidden_state.shape) + ic(global_features.shape) + + + last_hidden_state = last_hidden_state.reshape_as(global_features) + last_hidden_state = last_hidden_state + global_features + fov_output = self.head(last_hidden_state) + fov_output = fov_output.reshape(1) if not return_dict: - outputs = (last_hidden_state, fov, hidden_states, attentions) - outputs = (i for i in outputs if i is not None) - return outputs + head_outputs = (fov_output,) + return head_outputs + encoder_outputs[1:] - return DepthProModelOutput( - last_hidden_state=last_hidden_state, - fov=fov, - hidden_states=hidden_states, - attentions=attentions, + return BaseModelOutput( + last_hidden_state=fov_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, ) @@ -1422,7 +1303,6 @@ def __init__(self, config): nn.ReLU(), ) - def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: predicted_depth = self.head(hidden_states) predicted_depth = predicted_depth.squeeze(dim=1) @@ -1450,14 +1330,45 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): class DepthProForDepthEstimation(DepthProPreTrainedModel): def __init__(self, config, use_fov_model=None): super().__init__(config) + self.config = config self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - self.depth_pro = DepthProModel(config, use_fov_model=self.use_fov_model) + # dinov2 (vit) like encoders + self.depth_pro = DepthProModel(config) + + # project hidden states from encoder to match expected inputs in fusion stage + combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims + self.projections = nn.ModuleList() + for i, in_channels 
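(Aside on the FOV head: starting from the 48x48 global features, the strided convolutions shrink the map 48 -> 24 -> 12 -> 6 and the final 6x6 kernel collapses it to a single value, which is why the output can be reshaped to a scalar. A shape-only sketch assuming decoder_hidden_size=256; the ViT token branch that is added to the global features is omitted here.)

```python
import torch
from torch import nn

# Shape-only sketch of the FOV branch, assuming decoder_hidden_size=256 and
# 48x48 global features (the lowest-resolution projected encoder output).
decoder_hidden_size = 256
global_neck = nn.Sequential(
    nn.Conv2d(decoder_hidden_size, decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1),
    nn.ReLU(True),
)
head = nn.Sequential(
    nn.Conv2d(decoder_hidden_size // 2, decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1),
    nn.ReLU(True),
    nn.Conv2d(decoder_hidden_size // 4, decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1),
    nn.ReLU(True),
    nn.Conv2d(decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0),
)

global_features = torch.randn(1, decoder_hidden_size, 48, 48)
out = head(global_neck(global_features))    # 48 -> 24 -> 12 -> 6 -> 1
print(out.shape)                            # torch.Size([1, 1, 1, 1])
```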
in enumerate(combined_feature_dims): + if i == len(combined_feature_dims)-1 and in_channels == config.decoder_hidden_size: + # projection for last layer can be ignored if input and output channels already match + self.projections.append(nn.Identity()) + else: + self.projections.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + # dpt (vit) like fusion stage + self.num_decoder_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_decoder_layers) + + # depth estimation head self.head = DepthProDepthEstimationHead(config) + # dinov2 (vit) like encoder + self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None + # Initialize weights and apply final processing self.post_init() + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1476,6 +1387,7 @@ def forward( Returns: Examples: + TODO ```python >>> from transformers import AutoImageProcessor, DPTForDepthEstimation >>> import torch @@ -1526,21 +1438,39 @@ def forward( output_hidden_states=output_hidden_states, return_dict=True, ) - last_hidden_state = depth_pro_outputs[0] - ic(last_hidden_state.shape) - predicted_depth = self.head(last_hidden_state) - ic(predicted_depth.shape) + last_hidden_state = depth_pro_outputs.last_hidden_state + last_hidden_state = [proj(state) for proj, state in zip(self.projections, last_hidden_state)] + fused_state = self.fusion_stage(last_hidden_state) + predicted_depth = self.head(fused_state) + + if self.use_fov_model: + # use lowest scaled image features for fov model + global_features = last_hidden_state[0].detach() + fov_encodings = self.fov_model( + pixel_values=pixel_values, + global_features=global_features, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + fov = fov_encodings.last_hidden_state + attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + else: + fov = None + attentions = depth_pro_outputs.attentions + hidden_states = depth_pro_outputs.hidden_states if not return_dict: - if loss is None: - return (predicted_depth,) + depth_pro_outputs[1:] - else: - return (loss, predicted_depth) + depth_pro_outputs[1:] + outputs = (predicted_depth, fov, hidden_states, attentions) + outputs = (i for i in outputs if i is not None) + return outputs return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - fov=depth_pro_outputs.fov, - hidden_states=depth_pro_outputs.hidden_states, - attentions=depth_pro_outputs.attentions, + fov=fov, + hidden_states=hidden_states, + attentions=attentions, ) From 6be242ce30589132e71bd437fd6016827c3d8b6a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 13:51:45 +0500 Subject: [PATCH 15/72] make fov model compatible with custom config --- .../depth_pro/configuration_depth_pro.py | 2 + .../models/depth_pro/modeling_depth_pro.py | 267 ++++++++++-------- 2 files changed, 150 insertions(+), 119 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 
0558309004171f..8e197dbd0dab41 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -138,6 +138,7 @@ def __init__( scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_decoder=False, use_fov_model=False, + num_fov_head_layers=2, **kwargs, ): super().__init__(**kwargs) @@ -168,6 +169,7 @@ def __init__( self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids self.intermediate_feature_dims = intermediate_feature_dims self.scaled_images_ratios = scaled_images_ratios diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 9f146177402c00..0ddd503c4cc94a 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -610,6 +610,97 @@ def forward(self, features): projected = self.proj(features) return self.upsample_blocks(projected) + +def interpolate(pixel_values, scale_factor): + return nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=scale_factor, + mode="bilinear", + align_corners=False, + ) + +def patch(pixel_values, patch_size, overlap_ratio): + """Creates Patches from Batch.""" + B, C, W, H = pixel_values.shape + + if W == H == patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + + stride = int(patch_size * (1 - overlap_ratio)) + + # (B, C, W, H) + patches = torch.nn.functional.unfold( + pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) + ) + # patches.shape (B, patch_size**2 * C, num_patches) + patches = patches.permute(2, 0, 1) + # patches.shape (num_patches, B, patch_size**2 * C) + patches = patches.reshape(-1, C, patch_size, patch_size) + # patches.shape (B * num_patches, C, patch_size, patch_size) + + return patches + +def reshape_feature(hidden_states, width, height): + """Discard class token and reshape 1D feature map to a 2D grid.""" + B, _, C = hidden_states.shape + # (B, WH+1, C) + hidden_states = hidden_states[:, 1:, :] # remove class token + # (B, WH, C) + hidden_states = hidden_states.reshape(B, width, height, C) + # (B, W, H, C) + hidden_states = hidden_states.permute(0, 3, 1, 2) + # (B, C, W, H) + return hidden_states + +def merge(patches, batch_size, merge_out_size): + """Recreates Batch from Patches.""" + num_patches, num_channels, out_size, out_size = patches.shape + + if num_patches == batch_size: + # merge only if the patches were created from scaled image + # patches are not created when scaled image size is equal to patch size + return patches + + box_size = int(math.sqrt(num_patches // batch_size)) + """ + merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) + padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) + """ + padding = ( box_size * out_size - merge_out_size ) // ( 2 * box_size - 2 ) + + i = 0 + boxes = [] + for h in range(box_size): + boxes_in_row = [] + for w in range(box_size): + box = patches[batch_size * i : batch_size * (i + 1)] + + if h != 0: + # remove pad from height if box is not at top border + box = box[..., padding:, :] + if w != 0: + # remove pad from width if box is not at left border + box = box[..., :, padding:] + if h != box_size - 1: + # remove pad 
from height if box is not at bottom border + box = box[..., :box.shape[-2]-padding, :] + if w != box_size - 1: + # remove pad from width if box is not at right border + box = box[..., :, :box.shape[-1]-padding] + + boxes_in_row.append(box) + i += 1 + + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + + boxes = torch.cat(boxes, dim=-2) + boxes = boxes[..., :merge_out_size, :merge_out_size] + return boxes + + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -700,96 +791,6 @@ def __init__(self, config: DepthProConfig) -> None: bias=True, ) - def _interpolate(self, pixel_values, scale_factor): - if scale_factor == 1: - return pixel_values - return nn.functional.interpolate( - pixel_values, - size=None, - scale_factor=scale_factor, - mode="bilinear", - align_corners=False, - ) - - def _patch(self, pixel_values, overlap_ratio): - if pixel_values.shape[-1] == self.config.patch_size: - # create patches only if scaled image is not already equal to patch size - return pixel_values - - patch_size = self.config.patch_size - stride = int(patch_size * (1 - overlap_ratio)) - - # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) - patches = torch.nn.functional.unfold( - pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) - ) - # patches.shape (B, -1, num_patches) - patches = patches.permute(2, 0, 1) - # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, self.config.num_channels, patch_size, patch_size) - # patches.shape (B * num_patches, config.num_channels, config.patch_size, config.patch_size) - - return patches - - def _reshape_feature( - self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 - ): - """Discard class token and reshape 1D feature map to a 2D grid.""" - b, hw, c = hidden_states.shape - - # Remove class token. 
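The `patch` and `merge` helpers introduced above carry the core geometry of the multi-scale encoder: `patch` slices each scaled image into overlapping tiles with `unfold`, using a stride derived from the overlap ratio, and `merge` re-tiles the per-patch feature maps while trimming `padding` pixels from interior borders so the seams line up. A minimal sanity check of that geometry, with illustrative sizes only (a 768x768 scaled image, patch_size=384, overlap_ratio=0.5, out_size=24; these numbers are assumptions for the example, not values asserted by this patch):

```python
import math

import torch

# Illustrative values only; not taken from the committed configuration.
batch_size, num_channels = 1, 3
patch_size, overlap_ratio = 384, 0.5
pixel_values = torch.randn(batch_size, num_channels, 768, 768)  # e.g. a 0.5-scaled 1536 px image

# patch(): overlapping tiles via unfold, stride derived from the overlap ratio
stride = int(patch_size * (1 - overlap_ratio))  # 192
patches = torch.nn.functional.unfold(pixel_values, kernel_size=patch_size, stride=stride)
patches = patches.permute(2, 0, 1).reshape(-1, num_channels, patch_size, patch_size)
print(patches.shape)  # torch.Size([9, 3, 384, 384]) -> a 3x3 grid of tiles

# merge(): trim `padding` pixels from interior borders so the 3x3 grid of
# out_size x out_size feature maps re-tiles into a single merge_out_size map
out_size, merge_out_size = 24, 48
box_size = int(math.sqrt(patches.shape[0] // batch_size))  # 3
padding = (box_size * out_size - merge_out_size) // (2 * box_size - 2)  # 6
assert (box_size - 2) * (out_size - 2 * padding) + 2 * (out_size - padding) == merge_out_size
```

The final assertion mirrors the formula quoted in the `merge` docstring above.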
- if cls_token_offset > 0: - hidden_states = hidden_states[:, cls_token_offset:, :] - - # Shape: (batch, height, width, dim) -> (batch, dim, height, width) - hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) - return hidden_states - - def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: - if batch_size == x.shape[0]: - # merge only if the patches were created from this scaled image - # pathces are not created when scaled image size is equal to patch size - return x - - # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) - box_size = int(math.sqrt(x.shape[0] // batch_size)) - - """ - merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) - padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) - """ - padding = ( box_size * self.out_size - merge_out_size ) // ( 2 * box_size - 2 ) - - i = 0 - boxes = [] - for h in range(box_size): - boxes_in_row = [] - for w in range(box_size): - box = x[batch_size * i : batch_size * (i + 1)] - - if h != 0: - # remove pad from height if box is not at top border - box = box[..., padding:, :] - if w != 0: - # remove pad from width if box is not at left border - box = box[..., :, padding:] - if h != box_size - 1: - # remove pad from height if box is not at bottom border - box = box[..., :box.shape[-2]-padding, :] - if w != box_size - 1: - # remove pad from width if box is not at right border - box = box[..., :, :box.shape[-1]-padding] - - boxes_in_row.append(box) - i += 1 - - boxes_in_row = torch.cat(boxes_in_row, dim=-1) - boxes.append(boxes_in_row) - - boxes = torch.cat(boxes, dim=-2) - boxes = boxes[..., :merge_out_size, :merge_out_size] - return boxes - def forward( self, pixel_values: torch.Tensor, @@ -825,14 +826,15 @@ def forward( scaled_images = [] for ratio in self.scaled_images_ratios: - scaled_images.append(self._interpolate(pixel_values, ratio)) + scaled_images.append(interpolate(pixel_values, ratio)) # (B, config.num_channels, config.image_size * ratio, config.image_size * ratio) # STEP 2: create patches for i in range(self.n_scaled_images): - scaled_images[i] = self._patch( + scaled_images[i] = patch( scaled_images[i], + patch_size=self.config.patch_size, overlap_ratio=self.scaled_images_overlap_ratios[i], ) scaled_images_num_patches = [len(i) for i in scaled_images] @@ -870,12 +872,12 @@ def forward( # (scaled_images_num_patches[i], self.seq_len+1, config.hidden_size) # b. reshape back to image like - features = self._reshape_feature( + features = reshape_feature( hidden_state, self.out_size, self.out_size ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) # c. merge patches back together - features = self._merge( + features = merge( features, batch_size=B, merge_out_size=self.out_size*2**i ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) @@ -897,14 +899,14 @@ def forward( # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like - features = self._reshape_feature( + features = reshape_feature( hidden_state, self.out_size, self.out_size, ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together - features = self._merge( + features = merge( features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) @@ -920,12 +922,12 @@ def forward( hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like - image_features = self._reshape_feature( + image_features = reshape_feature( hidden_state, self.out_size, self.out_size ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = self._merge( + image_features = merge( image_features, batch_size=B, merge_out_size=self.out_size*2**(0), ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) @@ -1206,18 +1208,39 @@ def __init__(self, config: DepthProConfig) -> None: self.hidden_size = config.hidden_size self.decoder_hidden_size = config.decoder_hidden_size + self.out_size = config.patch_size // config.patch_embeddings_size + self.encoder = DepthProViT(config) self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) self.global_neck = nn.Sequential( nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) - self.head = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + + if config.decoder_hidden_size // 2**config.num_fov_head_layers == 0: + raise ValueError( + f"decoder_hidden_size={config.decoder_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " + "i.e config.decoder_hidden_size // 2**config.num_fov_head_layers > 0" + ) + + # create initial head layers + self.head = nn.Sequential() + for i in range(config.num_fov_head_layers): + self.head.append( + nn.Conv2d(self.decoder_hidden_size // 2**(i+1), self.decoder_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + ) + self.head.append(nn.ReLU(True)) + # calculate expected shapes to finally generate a scalar output from final head layer + final_in_channels = self.decoder_hidden_size // 2**(config.num_fov_head_layers+1) + final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) + self.head.append( + nn.Conv2d( + in_channels=final_in_channels, + out_channels=1, + kernel_size=final_kernal_size, + stride=1, + padding=0 + ) ) def forward( @@ -1235,34 +1258,40 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pixel_values = nn.functional.interpolate( + B, C, W, H = pixel_values.shape + + # follow the steps same as with image features in DepthProEncoder + pixel_values = interpolate( pixel_values, - size=None, - scale_factor=0.25, - mode="bilinear", - align_corners=False, + scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image ) - encoder_outputs = self.encoder( + patches = patch( pixel_values, + patch_size=self.config.patch_size, + overlap_ratio=self.config.scaled_images_overlap_ratios[0], + ) + encoder_outputs = self.encoder( + patches, head_mask=head_mask, 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] - last_hidden_state = self.encoder_neck(last_hidden_state) - - last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token - last_hidden_state = last_hidden_state.permute(0, 2, 1) + last_hidden_state = reshape_feature( + last_hidden_state, + width=self.out_size, + height=self.out_size + ) + last_hidden_state = merge( + last_hidden_state, + batch_size=B, + merge_out_size=self.out_size, + ) global_features = self.global_neck(global_features) - ic(last_hidden_state.shape) - ic(global_features.shape) - - - last_hidden_state = last_hidden_state.reshape_as(global_features) last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(1) From 01891085f0961ea28049616abed63a8bd9cb2f05 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 13:54:43 +0500 Subject: [PATCH 16/72] replace word "decoder" with "fusion" --- .../depth_pro/configuration_depth_pro.py | 10 ++--- .../models/depth_pro/modeling_depth_pro.py | 44 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 8e197dbd0dab41..f124d3e5b71ab7 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -59,7 +59,7 @@ class DepthProConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): - TODO: image_size / 2**n_decoder_blocks = patch_size / patch_embeddings_size + TODO: image_size / 2**n_fusion_blocks = patch_size / patch_embeddings_size The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. 
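The field-of-view head rebuilt in the previous hunks halves its channel width with every strided convolution, which is why the patch validates `decoder_hidden_size // 2**num_fov_head_layers > 0` (renamed to `fusion_hidden_size` just below), and it sizes the final kernel so that the remaining feature map collapses to a single value per image. A quick shape check under assumed values (hidden size 256, two head layers, out_size 24; example numbers, not the shipped defaults):

```python
import torch
from torch import nn

# Example numbers only; not the shipped defaults.
fusion_hidden_size, num_fov_head_layers, out_size = 256, 2, 24

layers = []
for i in range(num_fov_head_layers):
    layers.append(
        nn.Conv2d(
            fusion_hidden_size // 2 ** (i + 1),
            fusion_hidden_size // 2 ** (i + 2),
            kernel_size=3,
            stride=2,
            padding=1,
        )
    )
    layers.append(nn.ReLU(True))
final_kernel_size = int((out_size - 1) / 2 ** num_fov_head_layers + 1)  # 6 for these values
layers.append(
    nn.Conv2d(fusion_hidden_size // 2 ** (num_fov_head_layers + 1), 1, kernel_size=final_kernel_size)
)
head = nn.Sequential(*layers)

features = torch.randn(2, fusion_hidden_size // 2, out_size, out_size)  # summed neck outputs
print(head(features).shape)  # torch.Size([2, 1, 1, 1]) -> one field-of-view scalar per image
```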
@@ -110,7 +110,7 @@ class DepthProConfig(PretrainedConfig): def __init__( self, hidden_size=1024, - decoder_hidden_size=256, + fusion_hidden_size=256, num_hidden_layers=24, num_attention_heads=16, mlp_ratio=4, @@ -136,7 +136,7 @@ def __init__( scaled_images_ratios = [0.25, 0.5, 1], scaled_images_overlap_ratios = [0.0, 0.5, 0.25], scaled_images_feature_dims = [1024, 1024, 512], - use_batch_norm_in_decoder=False, + use_batch_norm_in_fusion=False, use_fov_model=False, num_fov_head_layers=2, **kwargs, @@ -144,7 +144,7 @@ def __init__( super().__init__(**kwargs) self.hidden_size = hidden_size - self.decoder_hidden_size = decoder_hidden_size + self.fusion_hidden_size = fusion_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.mlp_ratio = mlp_ratio @@ -167,7 +167,7 @@ def __init__( ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.use_batch_norm_in_decoder = use_batch_norm_in_decoder + self.use_batch_norm_in_fusion = use_batch_norm_in_fusion self.use_fov_model = use_fov_model self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 0ddd503c4cc94a..0ac35b582d7fca 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -706,7 +706,7 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.fusion_hidden_size = config.fusion_hidden_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -762,7 +762,7 @@ def __init__(self, config: DepthProConfig) -> None: # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() for i, feature_dims in enumerate(self.intermediate_feature_dims): - intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims + intermediate_dims = self.fusion_hidden_size if i == 0 else feature_dims upsample_block = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=intermediate_dims, @@ -939,7 +939,7 @@ def forward( scaled_images_features[0] = torch.cat((scaled_images_features[0], image_features), dim=1) scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) - # STEP 8: return these features in order of increasing size as what decoder expects + # STEP 8: return these features in order of increasing size as what fusion expects last_hidden_state = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) *scaled_images_features, @@ -1094,8 +1094,8 @@ class DepthProResidualLayer(nn.Module): def __init__(self, config): super().__init__() - self.use_batch_norm = config.use_batch_norm_in_decoder - self.hidden_size = config.decoder_hidden_size + self.use_batch_norm = config.use_batch_norm_in_fusion + self.hidden_size = config.fusion_hidden_size self.activation1 = nn.ReLU() self.convolution1 = nn.Conv2d( @@ -1151,15 +1151,15 @@ def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: if self.use_deconv: self.deconv = nn.ConvTranspose2d( - in_channels=config.decoder_hidden_size, - out_channels=config.decoder_hidden_size, + in_channels=config.fusion_hidden_size, + 
out_channels=config.fusion_hidden_size, kernel_size=2, stride=2, padding=0, bias=False, ) - self.projection = nn.Conv2d(config.decoder_hidden_size, config.decoder_hidden_size, kernel_size=1, bias=True) + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) self.skip_add = nn.quantized.FloatFunctional() def forward(self, hidden_state, residual=None): @@ -1206,32 +1206,32 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.fusion_hidden_size = config.fusion_hidden_size self.out_size = config.patch_size // config.patch_embeddings_size self.encoder = DepthProViT(config) - self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.encoder_neck = nn.Linear(self.hidden_size, self.fusion_hidden_size // 2) self.global_neck = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) - if config.decoder_hidden_size // 2**config.num_fov_head_layers == 0: + if config.fusion_hidden_size // 2**config.num_fov_head_layers == 0: raise ValueError( - f"decoder_hidden_size={config.decoder_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " - "i.e config.decoder_hidden_size // 2**config.num_fov_head_layers > 0" + f"fusion_hidden_size={config.fusion_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " + "i.e config.fusion_hidden_size // 2**config.num_fov_head_layers > 0" ) # create initial head layers self.head = nn.Sequential() for i in range(config.num_fov_head_layers): self.head.append( - nn.Conv2d(self.decoder_hidden_size // 2**(i+1), self.decoder_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + nn.Conv2d(self.fusion_hidden_size // 2**(i+1), self.fusion_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.decoder_hidden_size // 2**(config.num_fov_head_layers+1) + final_in_channels = self.fusion_hidden_size // 2**(config.num_fov_head_layers+1) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( @@ -1311,7 +1311,7 @@ class DepthProDepthEstimationHead(nn.Module): """ The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. This module comprises a sequence of convolutional and transposed convolutional layers - that process the feature map from the decoder to produce a single-channel depth map. + that process the feature map from the fusion to produce a single-channel depth map. Key operations include dimensionality reduction and upsampling to match the input resolution. 
""" @@ -1319,7 +1319,7 @@ def __init__(self, config): super().__init__() self.config = config - features = config.decoder_hidden_size + features = config.fusion_hidden_size self.head = nn.Sequential( nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( @@ -1369,14 +1369,14 @@ def __init__(self, config, use_fov_model=None): combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims self.projections = nn.ModuleList() for i, in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims)-1 and in_channels == config.decoder_hidden_size: + if i == len(combined_feature_dims)-1 and in_channels == config.fusion_hidden_size: # projection for last layer can be ignored if input and output channels already match self.projections.append(nn.Identity()) else: self.projections.append( nn.Conv2d( in_channels=in_channels, - out_channels=config.decoder_hidden_size, + out_channels=config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, @@ -1385,8 +1385,8 @@ def __init__(self, config, use_fov_model=None): ) # dpt (vit) like fusion stage - self.num_decoder_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) - self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_decoder_layers) + self.num_fusion_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_fusion_layers) # depth estimation head self.head = DepthProDepthEstimationHead(config) From 7614e1a709c14c8f9e32730fe240e401ae023ec3 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 24 Nov 2024 13:57:36 +0500 Subject: [PATCH 17/72] weight conversion script --- .../depth_pro/convert_depth_pro_to_hf.py | 344 ++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 src/transformers/models/depth_pro/convert_depth_pro_to_hf.py diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py new file mode 100644 index 00000000000000..38b7a7853d76d6 --- /dev/null +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DepthPro checkpoints from the original repository. 
+ +URL: https://huggingface.co/apple/DepthPro/tree/main +""" + +import argparse +import json +from pathlib import Path +import re + +import requests +import torch +import torch.nn as nn +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model +from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.utils import logging + +# TODO: import directly from transformers +from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig +from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def create_vit_rename_keys(config): + rename_keys = [] + # fmt: off + + # patch embedding layer + rename_keys.append(("cls_token", "embeddings.cls_token")) + rename_keys.append(("pos_embed", "embeddings.position_embeddings")) + rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) + + for i in range(config.num_hidden_layers): + # layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) + # MLP + if config.use_swiglu_ffn: + rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) + rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) + rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) + rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) + else: + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) + # layerscale + rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) + rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) + # attention projection layer + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) + + # final layernorm + rename_keys.append(("norm.weight", "layernorm.weight")) + rename_keys.append(("norm.bias", "layernorm.bias")) + + # fmt: on + return rename_keys + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + state_dict_keys = state_dict.keys() + for key in list(state_dict_keys): + if "qkv" in key: + in_proj = state_dict.pop(key) + q, k, v = torch.split(in_proj, config.hidden_size, dim=0) + + if "fov" in key: + key = key.replace('fov.encoder.0', 'fov_model.encoder') + else: + key = "depth_pro." 
+ key + + key = key.replace("blocks", "encoder.layer") + state_dict[key.replace("attn.qkv", "attention.attention.query")] = q + state_dict[key.replace("attn.qkv", "attention.attention.key")] = k + state_dict[key.replace("attn.qkv", "attention.attention.value")] = v + return state_dict + +# hard coded upsample keys +def update_hard_coded_keys(state_dict): + mapping = [ + # upsamples + ('encoder.upsample_latent0.0.weight', 'depth_pro.encoder.upsample_intermediate.1.proj.weight'), + ('encoder.upsample_latent0.1.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight'), + ('encoder.upsample_latent0.2.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight'), + ('encoder.upsample_latent0.3.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight'), + ('encoder.upsample_latent1.0.weight', 'depth_pro.encoder.upsample_intermediate.0.proj.weight'), + ('encoder.upsample_latent1.1.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight'), + ('encoder.upsample_latent1.2.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight'), + ('encoder.upsample0.0.weight', 'depth_pro.encoder.upsample_scaled_images.2.proj.weight'), + ('encoder.upsample0.1.weight', 'depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight'), + ('encoder.upsample1.0.weight', 'depth_pro.encoder.upsample_scaled_images.1.proj.weight'), + ('encoder.upsample1.1.weight', 'depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight'), + ('encoder.upsample2.0.weight', 'depth_pro.encoder.upsample_scaled_images.0.proj.weight'), + ('encoder.upsample2.1.weight', 'depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight'), + ('encoder.upsample_lowres.weight', 'depth_pro.encoder.upsample_image.upsample_blocks.0.weight'), + ('encoder.upsample_lowres.bias', 'depth_pro.encoder.upsample_image.upsample_blocks.0.bias'), + + # neck + ("fov.downsample.0.weight", "fov_model.global_neck.0.weight"), + ("fov.downsample.0.bias", "fov_model.global_neck.0.bias"), + ("fov.encoder.1.weight", "fov_model.encoder_neck.weight"), + ("fov.encoder.1.bias", "fov_model.encoder_neck.bias"), + ] + for src, dest in mapping: + state_dict[dest] = state_dict.pop(src) + + return state_dict + + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + + + +@torch.no_grad() +def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our DepthPro structure. + """ + + # define default DepthPro configuration + config = DepthProConfig() + + # load original weights from huggingface hub + # TODO: download from hub + # file_path = hf_hub_download(repo_id, filename) + file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" + state_dict = torch.load(file_path, weights_only=True) + + # enumerate fusion layers + n_scaled_images = len(config.scaled_images_ratios) # 3 + n_intermediate_hooks = len(config.intermediate_hook_ids) # 2 + n_fusion_layers = n_scaled_images + n_intermediate_hooks # 5 + + # 1. 
keys for vit encoders + vit_rename_keys = create_vit_rename_keys(config) + for src_prefix, dest_prefix in [ + ("encoder.patch_encoder", "depth_pro.encoder.patch_encoder"), + ("encoder.image_encoder", "depth_pro.encoder.image_encoder"), + ("fov.encoder.0", "fov_model.encoder"), + ]: + for src, dest in vit_rename_keys: + src = src_prefix + "." + src + dest = dest_prefix + "." + dest + state_dict[dest] = state_dict.pop(src) + + # 2. qkv keys for vit encoders + state_dict = read_in_q_k_v(state_dict, config) + + # 3. hard coded mapping + state_dict = update_hard_coded_keys(state_dict) + + + for key in list(state_dict.keys()): + + # 4. final depth estimation head + if key.startswith("head."): + new_key = "head." + key + + # 5. fov model head + elif key.startswith("fov.head."): + new_key = key.replace("fov", 'fov_model') + + # 6. projections between encoder and fusion + elif "decoder.convs." in key: + n = re.findall(r'\d+', key)[0] # find digit inside string + n = n_fusion_layers - int(n) - 1 + new_key = f"projections.{n}.weight" + + # 7. fuse low res with image features + elif "encoder.fuse_lowres." in key: + new_key = key.replace("encoder.fuse_lowres", "depth_pro.encoder.fuse_image_with_low_res") + + # 8. fusion stage (decoder) + elif key.startswith("decoder.fusions."): + new_key = key.replace("decoder.fusions.", "fusion_stage.layers.") + new_key = new_key.replace("resnet1", "residual_layer1") + new_key = new_key.replace("resnet2", "residual_layer2") + new_key = new_key.replace("residual.1", "convolution1") + new_key = new_key.replace("residual.3", "convolution2") + new_key = new_key.replace("out_conv", "projection") + + n_with_dots = re.findall(r'.\d+.', new_key)[0] # find digit inside string followed by . + n = n_with_dots[1:-1] + n = n_fusion_layers - int(n) - 1 + new_key = new_key.replace(n_with_dots, f".{n}.") + + else: + continue + + state_dict[new_key] = state_dict.pop(key) + + model = DepthProForDepthEstimation(config, use_fov_model=True).eval() + model.load_state_dict(state_dict) + + exit() + + # ---------------- + + + + for key, val in state_dict.copy().items(): + val = state_dict.pop(key) + if "w12" in key: + key = key.replace("w12", "weights_in") + if "w3" in key: + key = key.replace("w3", "weights_out") + state_dict[key] = val + + # load HuggingFace model + if image_classifier: + model = Dinov2ForImageClassification(config).eval() + model.dinov2.load_state_dict(state_dict) + model_name_to_classifier_dict_url = { + "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", + "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", + "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", + "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", + } + url = model_name_to_classifier_dict_url[model_name] + classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) + model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) + else: + model = Dinov2Model(config).eval() + model.load_state_dict(state_dict) + + # load image + image = prepare_img() + + # preprocess image + transformations = transforms.Compose( + [ + transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + 
mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values + std=IMAGENET_DEFAULT_STD, # across a large photo dataset. + ), + ] + ) + + original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + + processor = BitImageProcessor( + size={"shortest_edge": 256}, + resample=PILImageResampling.BICUBIC, + image_mean=IMAGENET_DEFAULT_MEAN, + image_std=IMAGENET_DEFAULT_STD, + ) + pixel_values = processor(image, return_tensors="pt").pixel_values + + assert torch.allclose(original_pixel_values, pixel_values) + + with torch.no_grad(): + outputs = model(pixel_values, output_hidden_states=True) + original_outputs = original_model(pixel_values) + + # assert values + if image_classifier: + print("Predicted class:") + class_idx = outputs.logits.argmax(-1).item() + print(model.config.id2label[class_idx]) + else: + assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape + assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + model_name_to_hf_name = { + "dinov2_vits14": "dinov2-small", + "dinov2_vitb14": "dinov2-base", + "dinov2_vitl14": "dinov2-large", + "dinov2_vitg14": "dinov2-giant", + "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", + "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", + "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", + "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", + } + + name = model_name_to_hf_name[model_name] + model.push_to_hub(f"facebook/{name}") + processor.push_to_hub(f"facebook/{name}") + + +convert_depth_pro_checkpoint("apple/DepthPro", "depth_pro.pt", "yooo_torch_dump", False) +exit() +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." + ) + parser.add_argument( + "--filename", default="depth_pro.pt", type=str, help="Name of the file from repo you'd like to convert." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_depth_pro_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) From 7d323ce91f071cc5ed6b0c36f407866e545dbe65 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:41:13 +0500 Subject: [PATCH 18/72] fix fov squeeze --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 0ac35b582d7fca..eb8bf02f83d160 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1294,7 +1294,7 @@ def forward( last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) - fov_output = fov_output.reshape(1) + fov_output = fov_output.reshape(B) if not return_dict: head_outputs = (fov_output,) From 6aaa59e943c5d5fd5c301404aaa47e8db1402355 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:42:18 +0500 Subject: [PATCH 19/72] update conversion script (without test) --- .../depth_pro/convert_depth_pro_to_hf.py | 160 +++++++----------- 1 file changed, 59 insertions(+), 101 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py index 38b7a7853d76d6..de7bf395a35552 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -83,6 +83,7 @@ def create_vit_rename_keys(config): # fmt: on return rename_keys + # we split up the matrix of each encoder layer into queries, keys and values def read_in_q_k_v(state_dict, config): state_dict_keys = state_dict.keys() @@ -102,6 +103,7 @@ def read_in_q_k_v(state_dict, config): state_dict[key.replace("attn.qkv", "attention.attention.value")] = v return state_dict + # hard coded upsample keys def update_hard_coded_keys(state_dict): mapping = [ @@ -134,13 +136,24 @@ def update_hard_coded_keys(state_dict): return state_dict - # We will verify our results on an image of cute cats -def prepare_img(): +def inference_test(processor, model): url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image + inputs = processor(image) + with torch.no_grad(): + outputs = model(**inputs) + + predicted_depth = outputs.predicted_depth + fov = outputs.fov + + predicted_depth, fov = processor.post_process_depth_estimation(predicted_depth, fov) + + print("predicted_depth.shape:", predicted_depth.shape) + print("fov.shape:", fov.shape) + print("fov:", fov) + print("Inference was Successfull!") @torch.no_grad() @@ -150,12 +163,10 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu """ # define default DepthPro configuration - config = DepthProConfig() + config = DepthProConfig(use_fov_model=True) # load original weights from huggingface hub - # TODO: download from hub - # file_path = hf_hub_download(repo_id, filename) - file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" + file_path = hf_hub_download(repo_id, filename) state_dict = torch.load(file_path, weights_only=True) # enumerate fusion layers @@ -224,108 +235,50 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu model = DepthProForDepthEstimation(config, use_fov_model=True).eval() model.load_state_dict(state_dict) - exit() - - # 
---------------- + # TODO + processor = ... + # inference_test(processor, model) - + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + # TODO + # print(f"Saving image processor to {pytorch_dump_folder_path}") + # processor.save_pretrained(pytorch_dump_folder_path) - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2ForImageClassification(config).eval() - model.dinov2.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", - "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", - "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", - "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2Model(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + # TODO + # if push_to_hub: + # model.push_to_hub("...") + # processor.push_to_hub("...") - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - assert torch.allclose(original_pixel_values, pixel_values) +""" +- create files locally using function +```py +convert_depth_pro_checkpoint( + "apple/DepthPro", + "depth_pro.pt", + "my_local_dump", + False, +) +``` + +- create files locally using command line args +```cmd +python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ + --repo_id "apple/DepthPro" \ + --filename "depth_pro.pt" \ + --pytorch_dump_folder_path "my_local_dump" \ + --push_to_hub 0 +``` +""" - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14": "dinov2-small", - "dinov2_vitb14": "dinov2-base", - "dinov2_vitl14": "dinov2-large", - "dinov2_vitg14": "dinov2-giant", - "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", - "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", - "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", - "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"facebook/{name}") - processor.push_to_hub(f"facebook/{name}") - - -convert_depth_pro_checkpoint("apple/DepthPro", "depth_pro.pt", "yooo_torch_dump", False) -exit() if __name__ == "__main__": parser = argparse.ArgumentParser() + # Required parameters parser.add_argument( "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." 
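The `read_in_q_k_v` helper earlier in this script relies on the original checkpoint storing each attention block's query, key and value as one fused `qkv` projection stacked along the output dimension, so a plain `torch.split` along dim 0 recovers the three HuggingFace weights. A minimal illustration with an arbitrary hidden size (the value is only for the example):

```python
import torch

hidden_size = 8  # arbitrary example value
fused_qkv_weight = torch.randn(3 * hidden_size, hidden_size)  # rows stacked as [q; k; v]

q, k, v = torch.split(fused_qkv_weight, hidden_size, dim=0)
assert q.shape == k.shape == v.shape == (hidden_size, hidden_size)
# each chunk then lands under e.g. "...attention.attention.query.weight" in the HF state dict
```

The same split would apply to a fused bias of shape `(3 * hidden_size,)`, since `torch.split` operates along dim 0 there as well.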
@@ -341,4 +294,9 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu ) args = parser.parse_args() - convert_depth_pro_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) + convert_depth_pro_checkpoint( + args.repo_id, + args.filename, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) From 263b773db7ac897a6a610e15a3fc5be0b79615da Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:47:17 +0500 Subject: [PATCH 20/72] upload ruff image processing --- .../depth_pro/image_processing_depth_pro.py | 397 ++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100644 src/transformers/models/depth_pro/image_processing_depth_pro.py diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py new file mode 100644 index 00000000000000..883c50ebfe6fbd --- /dev/null +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -0,0 +1,397 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for DepthPro.""" + +from typing import Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np +from icecream import ic + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, logging + +import math +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import pad, resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_torch_available, + is_torch_tensor, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_vision_available, + logging, + requires_backends, +) + +from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimatorOutput + + +if is_torch_available(): + import torch + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class DepthProImageProcessor(BaseImageProcessor): + r""" + Constructs a DepthPro image processor. 
+ + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. 
+ data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + + # ic(image.dtype) + # ic(type(image)) + # ic(image.shape) + # ic(image.mean()) + # ic(image.std()) + # ic(image.min()) + # ic(image.max()) + # ic(output_size) + # ic(resample) + # ic(data_format) + # ic(input_data_format) + # # exit() + + # return torch.nn.functional.interpolate( + # input=torch.from_numpy(image), + # size=output_size, + # mode=resample, + # align_corners=True, + # ) + + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. 
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + # TODO + # depth-pro image preprocessing scales the image before resizing it + + if do_resize: + images = [ + self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + predicted_depth, + fov=None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + self.size = { + 'width': 3024, + 'height': 2268, + } + W = self.size['width'] + H = self.size['height'] + + if (fov is not None) and (len(predicted_depth) != len(fov)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + output_depths = [] + output_fovs = None if fov is None else [] + fov = [None] * len(predicted_depth) if fov is None else fov + for depth, fov_value in zip(predicted_depth, fov): + + if fov_value is not None: + fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * W / fov_value + + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False + ).squeeze() + + if fov_value is not None: + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + output_fovs.append(fov_value) + + output_depths.append(depth) + + return output_depths, output_fovs From 17e5487ce6782998aaccb8a8799b9495d7d545bd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 09:35:52 +0500 Subject: [PATCH 21/72] create fast image processing --- .../image_processing_depth_pro_fast.py | 362 ++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 src/transformers/models/depth_pro/image_processing_depth_pro_fast.py diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py new file mode 100644 index 00000000000000..8860f2e86830c0 --- /dev/null +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -0,0 +1,362 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for DepthPro.""" + +import functools +from typing import Dict, List, Optional, Union + +from ...image_processing_base import BatchFeature +from ...image_processing_utils import get_size_dict +from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + ImageType, + PILImageResampling, + get_image_type, + make_list_of_images, + pil_torch_interpolation_mapping, +) +from ...utils import TensorType, logging, requires_backends +from ...utils.import_utils import is_torch_available, is_torchvision_available + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +if is_torchvision_available(): + from torchvision.transforms import Compose, Normalize, PILToTensor, Resize + + +class DepthProImageProcessorFast(BaseImageProcessorFast): + r""" + Constructs a DepthPro image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
+ """ + + model_input_names = ["pixel_values"] + _transform_params = [ + "do_resize", + "do_rescale", + "do_normalize", + "size", + "resample", + "antialias", + "rescale_factor", + "image_mean", + "image_std", + "image_type", + ] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.antialias = antialias + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def _build_transforms( + self, + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + image_type: ImageType, + ) -> "Compose": + """ + Given the input settings build the image transforms using `torchvision.transforms.Compose`. + """ + transforms = [] + + # All PIL and numpy values need to be converted to a torch tensor + # to keep cross compatibility with slow image processors + if image_type == ImageType.PIL: + transforms.append(PILToTensor()) + + elif image_type == ImageType.NUMPY: + transforms.append(NumpyToTensor()) + + # We can combine rescale and normalize into a single operation for speed + if do_rescale and do_normalize: + transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor)) + elif do_rescale: + transforms.append(Rescale(rescale_factor=rescale_factor)) + elif do_normalize: + transforms.append(Normalize(image_mean, image_std)) + + # depth-pro scales the image before resizing it + if do_resize: + transforms.append( + Resize( + (size["height"], size["width"]), + interpolation=pil_torch_interpolation_mapping[resample], + antialias=antialias + ) + ) + + return Compose(transforms) + + @functools.lru_cache(maxsize=1) + def _validate_input_arguments( + self, + return_tensors: Union[str, TensorType], + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + data_format: Union[str, ChannelDimension], + image_type: ImageType, + ): + if return_tensors != "pt": + raise ValueError("Only returning PyTorch tensors is currently supported.") + + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be 
specified if do_normalize is True.") + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + antialias: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Only "pt" is supported + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. The following formats are currently supported: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
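A minimal usage sketch for the fast processor defined in this patch, assuming the patch is installed and that both torch and torchvision are available; the randomly generated PIL image below is only a stand-in for a real photo:

```py
# Illustrative only: exercises the preprocess() defaults added in this patch
# (resize to 1536x1536, rescale by 1/255, normalize with mean/std 0.5).
import numpy as np
import torch
from PIL import Image

from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast

image = Image.fromarray(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))

processor = DepthProImageProcessorFast()
inputs = processor.preprocess(image, return_tensors="pt")

print(inputs["pixel_values"].shape)  # expected: torch.Size([1, 3, 1536, 1536])
print(inputs["pixel_values"].dtype)  # typically torch.float32 after rescale + normalize
```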
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + size = size if size is not None else self.size + # Make hashable for cache + size = SizeDict(**size) + image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean + image_std = tuple(image_std) if isinstance(image_std, list) else image_std + + images = make_list_of_images(images) + image_type = get_image_type(images[0]) + + if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]: + raise ValueError(f"Unsupported input image type {image_type}") + + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + antialias=antialias, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + return_tensors=return_tensors, + data_format=data_format, + image_type=image_type, + ) + + transforms = self.get_transforms( + do_resize=do_resize, + do_rescale=do_rescale, + do_normalize=do_normalize, + size=size, + resample=resample, + antialias=antialias, + rescale_factor=rescale_factor, + image_mean=image_mean, + image_std=image_std, + image_type=image_type, + ) + transformed_images = [transforms(image) for image in images] + + data = {"pixel_values": torch.stack(transformed_images, dim=0)} + return BatchFeature(data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + predicted_depth, + fov=None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. 
+ """ + requires_backends(self, "torch") + + self.size = { + 'width': 3024, + 'height': 2268, + } + W = self.size['width'] + H = self.size['height'] + + if (fov is not None) and (len(predicted_depth) != len(fov)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + output_depths = [] + output_fovs = None if fov is None else [] + fov = [None] * len(predicted_depth) if fov is None else fov + for depth, fov_value in zip(predicted_depth, fov): + + if fov_value is not None: + fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * W / fov_value + + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False + ).squeeze() + + if fov_value is not None: + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + output_fovs.append(fov_value) + + output_depths.append(depth) + + return output_depths, output_fovs From a8dd7049a5e2683a06f8d8df4cb7d22673d35b4b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 10:42:36 +0500 Subject: [PATCH 22/72] use torch interpolation for image processing --- .../depth_pro/image_processing_depth_pro.py | 112 +++++++++++------- 1 file changed, 66 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 883c50ebfe6fbd..d8b9ff493b1ab2 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,6 +14,7 @@ # limitations under the License. """Image processor class for DepthPro.""" +import functools from typing import Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union @@ -33,7 +34,7 @@ make_list_of_images, to_numpy_array, valid_images, - validate_preprocess_arguments, + pil_torch_interpolation_mapping, ) from ...utils import TensorType, filter_out_non_signature_kwargs, logging @@ -62,7 +63,6 @@ make_list_of_images, to_numpy_array, valid_images, - validate_preprocess_arguments, ) from ...utils import ( TensorType, @@ -99,6 +99,9 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. 
@@ -123,6 +126,7 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -138,15 +142,17 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample + self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD def resize( self, - image: np.ndarray, + images: List[np.ndarray], size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -155,12 +161,15 @@ def resize( Resize an image to `(size["height"], size["width"])`. Args: - image (`np.ndarray`): - Image to resize. + images (`List[np.ndarray]`): + Images to resize. size (`Dict[str, int]`): Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -175,41 +184,49 @@ def resize( - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. Returns: - `np.ndarray`: The resized image. + `np.ndarray`: The resized images. """ size = get_size_dict(size) if "height" not in size or "width" not in size: raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - # ic(image.dtype) - # ic(type(image)) - # ic(image.shape) - # ic(image.mean()) - # ic(image.std()) - # ic(image.min()) - # ic(image.max()) - # ic(output_size) - # ic(resample) - # ic(data_format) - # ic(input_data_format) - # # exit() - - # return torch.nn.functional.interpolate( - # input=torch.from_numpy(image), - # size=output_size, - # mode=resample, - # align_corners=True, - # ) - - return resize( - image, + images = np.stack(images) + images = torch.from_numpy(images) + + return torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=images, size=output_size, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) + # mode=pil_torch_interpolation_mapping[resample], + mode="bilinear", + antialias=antialias, + ).numpy() + + def _validate_input_arguments( + self, + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + data_format: Union[str, ChannelDimension], + ): + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") @filter_out_non_signature_kwargs() def preprocess( @@ -218,6 +235,7 @@ def preprocess( do_resize: Optional[bool] = None, size: Dict[str, int] = None, resample: PILImageResampling = None, + antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -242,6 +260,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -275,6 +296,7 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -289,15 +311,17 @@ def preprocess( "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "torch.Tensor, tf.Tensor or jax.ndarray." 
) - validate_preprocess_arguments( + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, - do_resize=do_resize, - size=size, - resample=resample, + data_format=data_format, ) # All transformations expect numpy arrays. @@ -313,15 +337,6 @@ def preprocess( # We assume that all images have the same channel dimension format. input_data_format = infer_channel_dimension_format(images[0]) - # TODO - # depth-pro image preprocessing scales the image before resizing it - - if do_resize: - images = [ - self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) - for image in images - ] - if do_rescale: images = [ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) @@ -338,6 +353,11 @@ def preprocess( to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] + # depth-pro scales the image before resizing it + # uses torch interpolation which requires ChannelDimension.FIRST + if do_resize: + images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) + data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) From 261bbafe4fb65d3bfe344045d92c7ca67f05283f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 12:12:39 +0500 Subject: [PATCH 23/72] complete post_process_depth_estimation --- .../depth_pro/image_processing_depth_pro.py | 71 +++++++++++-------- .../image_processing_depth_pro_fast.py | 70 ++++++++++-------- 2 files changed, 83 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index d8b9ff493b1ab2..0a7313e2d19a43 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,13 +14,13 @@ # limitations under the License. """Image processor class for DepthPro.""" -import functools from typing import Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union import numpy as np from icecream import ic + from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format from ...image_utils import ( @@ -186,6 +186,8 @@ def resize( Returns: `np.ndarray`: The resized images. """ + requires_backends(self, "torch") + size = get_size_dict(size) if "height" not in size or "width" not in size: raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") @@ -198,10 +200,9 @@ def resize( # input should be (B, C, H, W) input=images, size=output_size, - # mode=pil_torch_interpolation_mapping[resample], - mode="bilinear", + mode=pil_torch_interpolation_mapping[resample].value, antialias=antialias, - ).numpy() + ) def _validate_input_arguments( self, @@ -357,14 +358,16 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) + images = images.numpy() data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) def post_process_depth_estimation( self, - predicted_depth, - fov=None, + predicted_depths, + fovs=None, + target_sizes=None, ) -> List[Dict[str, TensorType]]: """ Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. @@ -383,35 +386,45 @@ def post_process_depth_estimation( """ requires_backends(self, "torch") - self.size = { - 'width': 3024, - 'height': 2268, - } - W = self.size['width'] - H = self.size['height'] - - if (fov is not None) and (len(predicted_depth) != len(fov)): + if (fovs is not None) and (len(predicted_depths) != len(fovs)): raise ValueError( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) + if (target_sizes is not None) and (len(predicted_depths) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + outputs = { + "predicted_depth": [], + "fov": [] if fovs is not None else None + } + + fovs = [None] * len(predicted_depths) if fovs is None else fovs + target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes + + for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - output_depths = [] - output_fovs = None if fov is None else [] - fov = [None] * len(predicted_depth) if fov is None else fov - for depth, fov_value in zip(predicted_depth, fov): + if target_size is not None: - if fov_value is not None: - fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * W / fov_value + # scale image w.r.t fov + if fov is not None: + width = target_size[1] + fov = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov)) + predicted_depth = predicted_depth * width / fov + outputs["fov"].append(fov) - depth = torch.nn.functional.interpolate( - depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False - ).squeeze() + # interpolate + predicted_depth = self.resize( + predicted_depth.unsqueeze(0).unsqueeze(1), + size=target_size, + resample=self.resample, + antialias=self.antialias + ).squeeze() - if fov_value is not None: - depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) - output_fovs.append(fov_value) + # inverse the depth + predicted_depth = 1.0 / torch.clamp(predicted_depth, min=1e-4, max=1e4) - output_depths.append(depth) + outputs["predicted_depth"].append(predicted_depth) - return output_depths, output_fovs + return outputs diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 8860f2e86830c0..38d699452e443a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -154,7 +154,7 @@ def _build_transforms( elif do_normalize: transforms.append(Normalize(image_mean, 
image_std)) - # depth-pro scales the image before resizing it + # depth-pro scales the image before resizing it if do_resize: transforms.append( Resize( @@ -229,9 +229,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -308,8 +308,9 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depth, - fov=None, + predicted_depths, + fovs=None, + target_sizes=None, ) -> List[Dict[str, TensorType]]: """ Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. @@ -328,35 +329,46 @@ def post_process_depth_estimation( """ requires_backends(self, "torch") - self.size = { - 'width': 3024, - 'height': 2268, - } - W = self.size['width'] - H = self.size['height'] - - if (fov is not None) and (len(predicted_depth) != len(fov)): + if (fovs is not None) and (len(predicted_depths) != len(fovs)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + if (target_sizes is not None) and (len(predicted_depths) != len(target_sizes)): raise ValueError( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - output_depths = [] - output_fovs = None if fov is None else [] - fov = [None] * len(predicted_depth) if fov is None else fov - for depth, fov_value in zip(predicted_depth, fov): + outputs = { + "predicted_depth": [], + "fov": [] if fovs is not None else None + } + + fovs = [None] * len(predicted_depths) if fovs is None else fovs + target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes + + for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): + + if target_size is not None: - if fov_value is not None: - fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * W / fov_value + # scale image w.r.t fov + if fov is not None: + width = target_size[1] + fov = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov)) + predicted_depth = predicted_depth * width / fov + outputs["fov"].append(fov) - depth = torch.nn.functional.interpolate( - depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False - ).squeeze() + # interpolate + predicted_depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=predicted_depth.unsqueeze(0).unsqueeze(1), + size=target_size, + mode=pil_torch_interpolation_mapping[self.resample].value, + antialias=self.antialias, + ).squeeze() - if fov_value is not None: - depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) - output_fovs.append(fov_value) + # inverse the depth + predicted_depth = 1.0 / torch.clamp(predicted_depth, min=1e-4, max=1e4) - output_depths.append(depth) + 
outputs["predicted_depth"].append(predicted_depth) - return output_depths, output_fovs + return outputs From a4b3556c5f7ef738048df1b7de22dfa45c822b43 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 16:36:19 +0500 Subject: [PATCH 24/72] config: fix imports and sort args --- .../depth_pro/configuration_depth_pro.py | 49 +++++++++---------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index f124d3e5b71ab7..fae3e84432be22 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -14,15 +14,8 @@ # limitations under the License. """DepthPro model configuration""" -from collections import OrderedDict -from typing import Mapping - -from packaging import version - -from transformers.configuration_utils import PretrainedConfig -from transformers.onnx import OnnxConfig -from transformers.utils import logging -from transformers.utils.backbone_utils import get_aligned_output_features_output_indices +from ...configuration_utils import PretrainedConfig +from ...utils import logging logger = logging.get_logger(__name__) @@ -41,6 +34,8 @@ class DepthProConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. + fusion_hidden_size + TODO num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): @@ -65,6 +60,8 @@ class DepthProConfig(PretrainedConfig): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. + patch_embeddings_size + TODO qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): @@ -73,22 +70,28 @@ class DepthProConfig(PretrainedConfig): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): Whether to use the SwiGLU feedforward neural network. - out_features (`List[str]`, *optional*): - If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. - (depending on how many stages the model has). If unset and `out_indices` is set, will default to the - corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. - out_indices (`List[int]`, *optional*): - If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how - many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. - If unset and `out_features` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. apply_layernorm (`bool`, *optional*, defaults to `True`): Whether to apply layer normalization to the feature maps in case the model is used as backbone. reshape_hidden_states (`bool`, *optional*, defaults to `True`): Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. 
+ intermediate_hook_ids + TODO + intermediate_feature_dims + TODO + scaled_images_ratios + TODO + scaled_images_overlap_ratios + TODO + scaled_images_feature_dims + TODO + use_batch_norm_in_fusion + TODO + use_fov_model + TODO + num_fov_head_layers + TODO Example: @@ -127,8 +130,6 @@ def __init__( layerscale_value=1.0, drop_path_rate=0.0, use_swiglu_ffn=False, - out_features=None, - out_indices=None, apply_layernorm=True, reshape_hidden_states=True, intermediate_hook_ids = [11, 5], @@ -137,7 +138,7 @@ def __init__( scaled_images_overlap_ratios = [0.0, 0.5, 0.25], scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_fusion=False, - use_fov_model=False, + use_fov_model=True, num_fov_head_layers=2, **kwargs, ): @@ -161,10 +162,6 @@ def __init__( self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate self.use_swiglu_ffn = use_swiglu_ffn - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=out_features, out_indices=out_indices, stage_names=self.stage_names - ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_fusion = use_batch_norm_in_fusion From f13c63208caec6b70a9d8660a42d92ec4c18af3a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 16:51:12 +0500 Subject: [PATCH 25/72] apply inference in weight conversion --- .../depth_pro/convert_depth_pro_to_hf.py | 63 ++++++++++++------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py index de7bf395a35552..7b4552c508fffe 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -18,24 +18,22 @@ """ import argparse -import json from pathlib import Path import re import requests import torch -import torch.nn as nn from huggingface_hub import hf_hub_download from PIL import Image -from torchvision import transforms -from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.image_utils import PILImageResampling from transformers.utils import logging +# from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation # TODO: import directly from transformers from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation +from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast logging.set_verbosity_info() @@ -147,13 +145,21 @@ def inference_test(processor, model): predicted_depth = outputs.predicted_depth fov = outputs.fov + target_sizes = [[image.height, image.width]] * len(predicted_depth) - predicted_depth, fov = processor.post_process_depth_estimation(predicted_depth, fov) + outputs = processor.post_process_depth_estimation( + predicted_depths=predicted_depth, + fovs=fov, + target_sizes=target_sizes, + ) + predicted_depth = outputs['predicted_depth'] + fov = outputs['fov'] - print("predicted_depth.shape:", predicted_depth.shape) - print("fov.shape:", fov.shape) + print("\nInference ...") + print("predicted_depth:", predicted_depth) + 
print("predicted_depth[0].shape:", predicted_depth[0].shape) print("fov:", fov) - print("Inference was Successfull!") + print("Inference was Successfull!\n") @torch.no_grad() @@ -167,6 +173,7 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu # load original weights from huggingface hub file_path = hf_hub_download(repo_id, filename) + # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" state_dict = torch.load(file_path, weights_only=True) # enumerate fusion layers @@ -235,23 +242,31 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu model = DepthProForDepthEstimation(config, use_fov_model=True).eval() model.load_state_dict(state_dict) - # TODO - processor = ... - # inference_test(processor, model) + processor = DepthProImageProcessorFast( + do_resize = True, + size = {"height": 1536, "width": 1536}, + resample = PILImageResampling.BILINEAR, + antialias = False, + do_rescale = True, + rescale_factor = 1 / 255, + do_normalize = True, + image_mean = 0.5, + image_std = 0.5, + return_tensors = "pt", + ) + inference_test(processor, model) if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - # TODO - # print(f"Saving image processor to {pytorch_dump_folder_path}") - # processor.save_pretrained(pytorch_dump_folder_path) - + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) - # TODO - # if push_to_hub: - # model.push_to_hub("...") - # processor.push_to_hub("...") + if push_to_hub: + hub_path = "geetu040/DepthPro" + model.push_to_hub(hub_path) + processor.push_to_hub(hub_path) """ @@ -260,8 +275,8 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu convert_depth_pro_checkpoint( "apple/DepthPro", "depth_pro.pt", - "my_local_dump", - False, + "my_local_depth_pro_dump", + True, ) ``` @@ -270,8 +285,8 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ --repo_id "apple/DepthPro" \ --filename "depth_pro.pt" \ - --pytorch_dump_folder_path "my_local_dump" \ - --push_to_hub 0 + --pytorch_dump_folder_path "my_local_depth_pro_dump" \ + --push_to_hub ``` """ From 387ddd8c7e50f419d1abcd5a61cd48ea23e0d626 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 10:55:18 +0500 Subject: [PATCH 26/72] use mllama script instead for weight conversion --- .../depth_pro/convert_depth_pro_to_hf.py | 317 ------------------ .../convert_depth_pro_weights_to_hf.py | 255 ++++++++++++++ 2 files changed, 255 insertions(+), 317 deletions(-) delete mode 100644 src/transformers/models/depth_pro/convert_depth_pro_to_hf.py create mode 100644 src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py deleted file mode 100644 index 7b4552c508fffe..00000000000000 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ /dev/null @@ -1,317 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DepthPro checkpoints from the original repository. - -URL: https://huggingface.co/apple/DepthPro/tree/main -""" - -import argparse -from pathlib import Path -import re - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - -# from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation -# TODO: import directly from transformers -from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig -from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation -from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def create_vit_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def 
read_in_q_k_v(state_dict, config): - state_dict_keys = state_dict.keys() - for key in list(state_dict_keys): - if "qkv" in key: - in_proj = state_dict.pop(key) - q, k, v = torch.split(in_proj, config.hidden_size, dim=0) - - if "fov" in key: - key = key.replace('fov.encoder.0', 'fov_model.encoder') - else: - key = "depth_pro." + key - - key = key.replace("blocks", "encoder.layer") - state_dict[key.replace("attn.qkv", "attention.attention.query")] = q - state_dict[key.replace("attn.qkv", "attention.attention.key")] = k - state_dict[key.replace("attn.qkv", "attention.attention.value")] = v - return state_dict - - -# hard coded upsample keys -def update_hard_coded_keys(state_dict): - mapping = [ - # upsamples - ('encoder.upsample_latent0.0.weight', 'depth_pro.encoder.upsample_intermediate.1.proj.weight'), - ('encoder.upsample_latent0.1.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight'), - ('encoder.upsample_latent0.2.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight'), - ('encoder.upsample_latent0.3.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight'), - ('encoder.upsample_latent1.0.weight', 'depth_pro.encoder.upsample_intermediate.0.proj.weight'), - ('encoder.upsample_latent1.1.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight'), - ('encoder.upsample_latent1.2.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight'), - ('encoder.upsample0.0.weight', 'depth_pro.encoder.upsample_scaled_images.2.proj.weight'), - ('encoder.upsample0.1.weight', 'depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight'), - ('encoder.upsample1.0.weight', 'depth_pro.encoder.upsample_scaled_images.1.proj.weight'), - ('encoder.upsample1.1.weight', 'depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight'), - ('encoder.upsample2.0.weight', 'depth_pro.encoder.upsample_scaled_images.0.proj.weight'), - ('encoder.upsample2.1.weight', 'depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight'), - ('encoder.upsample_lowres.weight', 'depth_pro.encoder.upsample_image.upsample_blocks.0.weight'), - ('encoder.upsample_lowres.bias', 'depth_pro.encoder.upsample_image.upsample_blocks.0.bias'), - - # neck - ("fov.downsample.0.weight", "fov_model.global_neck.0.weight"), - ("fov.downsample.0.bias", "fov_model.global_neck.0.bias"), - ("fov.encoder.1.weight", "fov_model.encoder_neck.weight"), - ("fov.encoder.1.bias", "fov_model.encoder_neck.bias"), - ] - for src, dest in mapping: - state_dict[dest] = state_dict.pop(src) - - return state_dict - - -# We will verify our results on an image of cute cats -def inference_test(processor, model): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - inputs = processor(image) - with torch.no_grad(): - outputs = model(**inputs) - - predicted_depth = outputs.predicted_depth - fov = outputs.fov - target_sizes = [[image.height, image.width]] * len(predicted_depth) - - outputs = processor.post_process_depth_estimation( - predicted_depths=predicted_depth, - fovs=fov, - target_sizes=target_sizes, - ) - predicted_depth = outputs['predicted_depth'] - fov = outputs['fov'] - - print("\nInference ...") - print("predicted_depth:", predicted_depth) - print("predicted_depth[0].shape:", predicted_depth[0].shape) - print("fov:", fov) - print("Inference was Successfull!\n") - - -@torch.no_grad() -def convert_depth_pro_checkpoint(repo_id, filename, 
pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DepthPro structure. - """ - - # define default DepthPro configuration - config = DepthProConfig(use_fov_model=True) - - # load original weights from huggingface hub - file_path = hf_hub_download(repo_id, filename) - # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" - state_dict = torch.load(file_path, weights_only=True) - - # enumerate fusion layers - n_scaled_images = len(config.scaled_images_ratios) # 3 - n_intermediate_hooks = len(config.intermediate_hook_ids) # 2 - n_fusion_layers = n_scaled_images + n_intermediate_hooks # 5 - - # 1. keys for vit encoders - vit_rename_keys = create_vit_rename_keys(config) - for src_prefix, dest_prefix in [ - ("encoder.patch_encoder", "depth_pro.encoder.patch_encoder"), - ("encoder.image_encoder", "depth_pro.encoder.image_encoder"), - ("fov.encoder.0", "fov_model.encoder"), - ]: - for src, dest in vit_rename_keys: - src = src_prefix + "." + src - dest = dest_prefix + "." + dest - state_dict[dest] = state_dict.pop(src) - - # 2. qkv keys for vit encoders - state_dict = read_in_q_k_v(state_dict, config) - - # 3. hard coded mapping - state_dict = update_hard_coded_keys(state_dict) - - - for key in list(state_dict.keys()): - - # 4. final depth estimation head - if key.startswith("head."): - new_key = "head." + key - - # 5. fov model head - elif key.startswith("fov.head."): - new_key = key.replace("fov", 'fov_model') - - # 6. projections between encoder and fusion - elif "decoder.convs." in key: - n = re.findall(r'\d+', key)[0] # find digit inside string - n = n_fusion_layers - int(n) - 1 - new_key = f"projections.{n}.weight" - - # 7. fuse low res with image features - elif "encoder.fuse_lowres." in key: - new_key = key.replace("encoder.fuse_lowres", "depth_pro.encoder.fuse_image_with_low_res") - - # 8. fusion stage (decoder) - elif key.startswith("decoder.fusions."): - new_key = key.replace("decoder.fusions.", "fusion_stage.layers.") - new_key = new_key.replace("resnet1", "residual_layer1") - new_key = new_key.replace("resnet2", "residual_layer2") - new_key = new_key.replace("residual.1", "convolution1") - new_key = new_key.replace("residual.3", "convolution2") - new_key = new_key.replace("out_conv", "projection") - - n_with_dots = re.findall(r'.\d+.', new_key)[0] # find digit inside string followed by . 
- n = n_with_dots[1:-1] - n = n_fusion_layers - int(n) - 1 - new_key = new_key.replace(n_with_dots, f".{n}.") - - else: - continue - - state_dict[new_key] = state_dict.pop(key) - - model = DepthProForDepthEstimation(config, use_fov_model=True).eval() - model.load_state_dict(state_dict) - - processor = DepthProImageProcessorFast( - do_resize = True, - size = {"height": 1536, "width": 1536}, - resample = PILImageResampling.BILINEAR, - antialias = False, - do_rescale = True, - rescale_factor = 1 / 255, - do_normalize = True, - image_mean = 0.5, - image_std = 0.5, - return_tensors = "pt", - ) - inference_test(processor, model) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - hub_path = "geetu040/DepthPro" - model.push_to_hub(hub_path) - processor.push_to_hub(hub_path) - - -""" -- create files locally using function -```py -convert_depth_pro_checkpoint( - "apple/DepthPro", - "depth_pro.pt", - "my_local_depth_pro_dump", - True, -) -``` - -- create files locally using command line args -```cmd -python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ - --repo_id "apple/DepthPro" \ - --filename "depth_pro.pt" \ - --pytorch_dump_folder_path "my_local_depth_pro_dump" \ - --push_to_hub -``` -""" - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required parameters - parser.add_argument( - "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." - ) - parser.add_argument( - "--filename", default="depth_pro.pt", type=str, help="Name of the file from repo you'd like to convert." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - - args = parser.parse_args() - convert_depth_pro_checkpoint( - args.repo_id, - args.filename, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py new file mode 100644 index 00000000000000..fe862d7469a1d3 --- /dev/null +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -0,0 +1,255 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
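The rewritten script below follows a regex-driven key-remapping pattern: original checkpoint keys are translated into Transformers-style keys via `re.sub` (some mapping entries use callables for index arithmetic), and fused qkv projections are split into separate query, key and value tensors. A simplified, self-contained illustration of both steps; the toy key and tensor shapes are invented for the example:

```py
import re

import torch

# One entry in the style of ORIGINAL_TO_CONVERTED_KEY_MAPPING, specialized to the patch encoder.
toy_mapping = {
    r"encoder.patch_encoder.blocks.(\d+).norm(\d+).(weight|bias)":
        r"depth_pro.encoder.patch_encoder.encoder.layer.\1.norm\2.\3",
}

old_key = "encoder.patch_encoder.blocks.3.norm1.weight"
new_key = old_key
for pattern, replacement in toy_mapping.items():
    new_key = re.sub(pattern, replacement, new_key)
print(new_key)  # depth_pro.encoder.patch_encoder.encoder.layer.3.norm1.weight

# Splitting a fused qkv projection into q, k, v (hidden_size assumed to be 8 here).
fused = torch.randn(3 * 8, 8)
q, k, v = torch.split(fused, split_size_or_sections=8, dim=0)
print(q.shape, k.shape, v.shape)  # each torch.Size([8, 8])
```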
+ +import argparse +import gc +import os + +import regex as re +import torch +from huggingface_hub import hf_hub_download +from transformers.image_utils import PILImageResampling + +from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig +from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast +from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation + + +# fmt: off +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + + # patch_encoder/image_encoder (ViT based) + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", + + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + r"decoder.convs.(\d+).weight": lambda match: ( + f"projections.{4-int(match.group(1))}.weight" + ), + + # fov_model.encoder (ViT based) + r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + + # fov head + r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + + # fusion stage + r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" + ), + r"decoder.fusions.(\d+).out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.projection.{match.group(2)}" + ), + 
r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" + ), + + # qkv attentions blocks + + # upsamples (hard coded; regex is not very feasible here) + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", + "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", +} +# fmt: on + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line + continue + new_text = re.sub(pattern, replacement, new_text) + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + return output_dict + +def get_qkv_state_dict(key, parameter): + qkv_state_dict = {} + placeholder = re.search(r'(\(.*?\))', key).group(1) + replacements_keys = placeholder[1:-1].split("|") + replacements_vals = torch.split( + parameter, + split_size_or_sections=parameter.size(0)//len(replacements_keys), + dim=0 + ) + for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): + qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val + return qkv_state_dict + +def write_model( + hf_repo_id: str, + output_dir: str, + safe_serialization: bool=True, +): + os.makedirs(output_dir, exist_ok=True) + + # ------------------------------------------------------------ + # Create and save config + # ------------------------------------------------------------ + + # create config + config = DepthProConfig( + # this config is same as the default config and used for pre-trained weights + hidden_size=1024, + fusion_hidden_size=256, + num_hidden_layers=24, + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=1536, + patch_size=384, + num_channels=3, + patch_embeddings_size=16, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + 
use_swiglu_ffn=False,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        intermediate_hook_ids = [11, 5],
+        intermediate_feature_dims = [256, 256],
+        scaled_images_ratios = [0.25, 0.5, 1],
+        scaled_images_overlap_ratios = [0.0, 0.5, 0.25],
+        scaled_images_feature_dims = [1024, 1024, 512],
+        use_batch_norm_in_fusion=False,
+        use_fov_model=True,
+        num_fov_head_layers=2,
+    )
+
+    # save config
+    config.save_pretrained(output_dir)
+    print("Model config saved successfully...")
+
+    # ------------------------------------------------------------
+    # Convert weights
+    # ------------------------------------------------------------
+
+    # download and load the original state_dict from the hf repo
+    file_path = hf_hub_download(hf_repo_id, "depth_pro.pt")
+    # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" # when you already have the files locally
+    loaded = torch.load(file_path, weights_only=True)
+
+    print("Converting model...")
+    all_keys = list(loaded.keys())
+    new_keys = convert_old_keys_to_new_keys(all_keys)
+
+    state_dict = {}
+    for key in all_keys:
+        new_key = new_keys[key]
+        current_parameter = loaded.pop(key)
+
+        if "qkv" in key:
+            qkv_state_dict = get_qkv_state_dict(new_key, current_parameter)
+            state_dict.update(qkv_state_dict)
+        else:
+            state_dict[new_key] = current_parameter
+
+    print("Loading the checkpoint in a DepthPro model.")
+    model = DepthProForDepthEstimation(config)
+    model.load_state_dict(state_dict, strict=True, assign=True)
+    print("Checkpoint loaded successfully.")
+
+    print("Saving the model.")
+    model.save_pretrained(output_dir, safe_serialization=safe_serialization)
+    del state_dict, model
+
+    # Safety check: reload the converted model
+    gc.collect()
+    print("Reloading the model to check if it's saved correctly.")
+    DepthProForDepthEstimation.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto")
+    print("Model reloaded successfully.")
+
+def write_image_processor(output_dir: str):
+    image_processor = DepthProImageProcessorFast(
+        do_resize = True,
+        size = {"height": 1536, "width": 1536},
+        resample = PILImageResampling.BILINEAR,
+        antialias = False,
+        do_rescale = True,
+        rescale_factor = 1 / 255,
+        do_normalize = True,
+        image_mean = 0.5,
+        image_std = 0.5,
+        return_tensors = "pt",
+    )
+    image_processor.save_pretrained(output_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--hf_repo_id",
+        default="apple/DepthPro",
+        help="Location of the official weights from Apple on the Hugging Face Hub",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default="apple_DepthPro",
+        help="Location to write the converted HF model and processor",
+    )
+    parser.add_argument(
+        "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`."
+ ) + args = parser.parse_args() + + write_model( + hf_repo_id=args.hf_repo_id, + output_dir=args.output_dir, + safe_serialization=args.safe_serialization, + ) + + write_image_processor( + output_dir=args.output_dir, + ) + + +if __name__ == "__main__": + main() From 9b67f9d2afc1b081a4990149eb16ea906ce09295 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 11:09:50 +0500 Subject: [PATCH 27/72] clean weight conversion script --- .../convert_depth_pro_weights_to_hf.py | 106 +++++++++--------- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index fe862d7469a1d3..0b81e8907e299e 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -29,39 +29,55 @@ # fmt: off ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # patch_encoder/image_encoder (ViT based) - r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", - r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", - r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", - r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", - r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", - r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", - r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + # encoder and head + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", + 
r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + + # fov + r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + + # upsamples (hard coded; regex is not very feasible here) + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", + "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", + + # projections between encoder and fusion r"decoder.convs.(\d+).weight": lambda match: ( f"projections.{4-int(match.group(1))}.weight" ), - # fov_model.encoder (ViT based) - r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", - r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", - r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", - r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", - r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", - 
r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", - r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", - r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", - r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", - - # fov head - r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", - r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", - r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", - # fusion stage r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" @@ -72,25 +88,6 @@ r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" ), - - # qkv attentions blocks - - # upsamples (hard coded; regex is not very feasible here) - "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", - "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", - "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", - "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", - "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", - "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", - "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", - "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", - "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", - "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", - "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", - "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", - "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", - "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", - "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", } # fmt: on @@ -108,9 +105,18 @@ def convert_old_keys_to_new_keys(state_dict_keys: dict = None): return output_dict def get_qkv_state_dict(key, parameter): + """ + new key which looks like this + xxxx.(q|k|v).xxx (m, n) + + is converted to + xxxx.q.xxxx (m//3, n) + xxxx.k.xxxx (m//3, n) + xxxx.v.xxxx (m//3, n) + """ qkv_state_dict = {} - placeholder = re.search(r'(\(.*?\))', key).group(1) - replacements_keys = placeholder[1:-1].split("|") + placeholder = re.search(r'(\(.*?\))', key).group(1) # finds "(query|key|value)" + replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] replacements_vals = torch.split( parameter, split_size_or_sections=parameter.size(0)//len(replacements_keys), From 617c872fb90d313f03fc55962088127e659241c7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 12:57:50 +0500 Subject: [PATCH 28/72] add depth-pro status in other files --- src/transformers/__init__.py 
| 16 +++++ .../models/auto/configuration_auto.py | 2 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/depth_pro/__init__.py | 72 +++++++++++++++++++ .../convert_depth_pro_weights_to_hf.py | 8 ++- .../depth_pro/image_processing_depth_pro.py | 2 - utils/check_docstrings.py | 1 + utils/check_repo.py | 1 + 9 files changed, 101 insertions(+), 5 deletions(-) create mode 100644 src/transformers/models/depth_pro/__init__.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 47b43e0b90896f..3d0b85e3a1b424 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -408,6 +408,7 @@ "DPRReaderTokenizer", ], "models.dpt": ["DPTConfig"], + "models.depth_pro": ["DepthProConfig"], "models.efficientnet": ["EfficientNetConfig"], "models.electra": [ "ElectraConfig", @@ -1195,6 +1196,7 @@ _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) + _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) @@ -2136,6 +2138,13 @@ "DPTPreTrainedModel", ] ) + _import_structure["models.depth_pro"].extend( + [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + ) _import_structure["models.efficientnet"].extend( [ "EfficientNetForImageClassification", @@ -5272,6 +5281,7 @@ DPRReaderTokenizer, ) from .models.dpt import DPTConfig + from .models.depth_pro import DepthProConfig from .models.efficientnet import ( EfficientNetConfig, ) @@ -6100,6 +6110,7 @@ from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor + from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.efficientnet import EfficientNetImageProcessor from .models.flava import ( FlavaFeatureExtractor, @@ -6907,6 +6918,11 @@ DPTModel, DPTPreTrainedModel, ) + from .models.depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) from .models.efficientnet import ( EfficientNetForImageClassification, EfficientNetModel, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 48625ea3f346cd..d8860d38f85046 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -93,6 +93,7 @@ ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), + ("depth_pro", "DepthProConfig"), ("efficientformer", "EfficientFormerConfig"), ("efficientnet", "EfficientNetConfig"), ("electra", "ElectraConfig"), @@ -394,6 +395,7 @@ ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), + ("depth_pro", "DepthPro"), ("efficientformer", "EfficientFormer"), ("efficientnet", "EfficientNet"), ("electra", "ELECTRA"), diff --git a/src/transformers/models/auto/image_processing_auto.py 
b/src/transformers/models/auto/image_processing_auto.py index a8960d80acc838..e7b53f30a7a064 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -77,6 +77,7 @@ ("dinov2", ("BitImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)), ("dpt", ("DPTImageProcessor",)), + ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor",)), ("efficientnet", ("EfficientNetImageProcessor",)), ("flava", ("FlavaImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 67c539fca66496..4cc15ca4ca51c2 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -92,6 +92,7 @@ ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), + ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("electra", "ElectraModel"), @@ -571,6 +572,7 @@ ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dpt", "DPTModel"), + ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), @@ -866,6 +868,7 @@ # Model for depth estimation mapping ("depth_anything", "DepthAnythingForDepthEstimation"), ("dpt", "DPTForDepthEstimation"), + ("depth_pro", "DepthProForDepthEstimation"), ("glpn", "GLPNForDepthEstimation"), ("zoedepth", "ZoeDepthForDepthEstimation"), ] diff --git a/src/transformers/models/depth_pro/__init__.py b/src/transformers/models/depth_pro/__init__.py new file mode 100644 index 00000000000000..1f2a6646c5c07f --- /dev/null +++ b/src/transformers/models/depth_pro/__init__.py @@ -0,0 +1,72 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
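The auto-class entries added above in `configuration_auto.py`, `image_processing_auto.py`, and `modeling_auto.py` are what expose DepthPro through the Auto API. A rough usage sketch, assuming a converted checkpoint is published under `geetu040/DepthPro` (the repo id used in the docstrings of this series) and that the output carries `predicted_depth` and `fov` fields; exact names may differ in the final release:

```python
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, AutoModelForDepthEstimation

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Assumed checkpoint id, taken from the docstrings in this series; not guaranteed to exist yet.
processor = AutoImageProcessor.from_pretrained("geetu040/DepthPro")
model = AutoModelForDepthEstimation.from_pretrained("geetu040/DepthPro")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post_process_depth_estimation resizes the prediction back to the source image size
# and rescales depth using the predicted field of view.
post = processor.post_process_depth_estimation(
    outputs.predicted_depth,
    fovs=outputs.fov,  # assumed attribute name on the depth estimator output
    target_sizes=[image.size[::-1]],  # PIL size is (width, height); targets are (height, width)
)
depth = post["predicted_depth"][0]
```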
+from typing import TYPE_CHECKING + +from ...file_utils import _LazyModule, is_torch_available, is_vision_available +from ...utils import OptionalDependencyNotAvailable + + +_import_structure = {"configuration_depth_pro": ["DepthProConfig"]} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_depth_pro"] = ["DepthProImageProcessor"] + _import_structure["image_processing_depth_pro_fast"] = ["DepthProImageProcessorFast"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_depth_pro"] = [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_depth_pro import DepthProConfig + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_depth_pro import DepthProImageProcessor + from .image_processing_depth_pro_fast import DepthProImageProcessorFast + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 0b81e8907e299e..741016e88a3d62 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -21,9 +21,11 @@ from huggingface_hub import hf_hub_download from transformers.image_utils import PILImageResampling -from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig -from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast -from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation +from transformers import ( + DepthProConfig, + DepthProImageProcessorFast, + DepthProForDepthEstimation, +) # fmt: off diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 0a7313e2d19a43..99a7c26c98269a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -72,8 +72,6 @@ requires_backends, ) -from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimatorOutput - if is_torch_available(): import torch diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 0be960f4a33e6d..34deed0df47e01 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -140,6 +140,7 @@ "DPRReaderTokenizer", "DPRReaderTokenizerFast", "DPTModel", + "DepthProModel", "Data2VecAudioConfig", "Data2VecTextConfig", "Data2VecTextModel", diff --git a/utils/check_repo.py b/utils/check_repo.py index 10be5cdcd26230..2e131e8791530e 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -213,6 +213,7 @@ "JukeboxPrior", "SamModel", "DPTForDepthEstimation", + "DepthProForDepthEstimation", "DecisionTransformerGPT2Model", 
"GLPNForDepthEstimation", "ViltForImagesAndTextClassification", From 6e1c512b15474979ea3176e85214ccc70fcc6cd7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 13:33:25 +0500 Subject: [PATCH 29/72] fill docstring in config --- .../depth_pro/configuration_depth_pro.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index fae3e84432be22..9b53288c41ed08 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -34,8 +34,8 @@ class DepthProConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. - fusion_hidden_size - TODO + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): @@ -53,15 +53,17 @@ class DepthProConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - TODO: image_size / 2**n_fusion_blocks = patch_size / patch_embeddings_size - The size (resolution) of each image. + image_size (`int`, *optional*, defaults to 1536): + The size (resolution) of each image, + To generate depth of same size as image, + image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size + where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. - patch_embeddings_size - TODO + patch_embeddings_size (`int`, *optional*, defaults to 16): + kernel_size and stride for convolution in PatchEmbeddings. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): @@ -77,21 +79,21 @@ class DepthProConfig(PretrainedConfig): case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. intermediate_hook_ids - TODO + Indices of the intermediate hidden states from patch_encoder to use for fusion. intermediate_feature_dims - TODO + Hidden state during upsampling for each intermediate hidden states in intermediate_hook_ids. scaled_images_ratios - TODO + Use images of these ratios for patch_encoder. scaled_images_overlap_ratios - TODO + Overlap ratio between patches for each scaled image in scaled_image_ratios. scaled_images_feature_dims - TODO + Hidden state during upsampling for each scaled image in scaled_images_ratios. use_batch_norm_in_fusion - TODO + Whether to use batch normalization in the residual units of the fusion blocks. use_fov_model - TODO + Whether to use `DepthProFOVModel` to generate Field of View. num_fov_head_layers - TODO + No of convolution layers in head of `DepthProFOVModel`. 
Example: From 12ee607e5d319a488d7e807a75927cb86f463cec Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 18:47:53 +0500 Subject: [PATCH 30/72] formatting --- .../depth_pro/configuration_depth_pro.py | 2 +- .../convert_depth_pro_weights_to_hf.py | 28 ++++----- .../depth_pro/image_processing_depth_pro.py | 48 +++++++++------ .../image_processing_depth_pro_fast.py | 40 ++++++++----- .../models/depth_pro/modeling_depth_pro.py | 58 ++++++------------- 5 files changed, 88 insertions(+), 88 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 9b53288c41ed08..8bab8227be7ec7 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -67,7 +67,7 @@ class DepthProConfig(PretrainedConfig): qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): - Initial value to use for layer scale. + Initial value to use for layer scale. drop_path_rate (`float`, *optional*, defaults to 0.0): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 741016e88a3d62..c3b77f17f04c69 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -41,7 +41,7 @@ r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", r"head.(\d+).(weight|bias)": r"head.head.\1.\2", # fov @@ -59,19 +59,19 @@ r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", # upsamples (hard coded; regex is not very feasible here) - "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", - "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", - "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", - "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", - "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", - "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", - "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", - "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", - "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", - "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", - "encoder.upsample1.1.weight": 
"depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", - "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", - "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 99a7c26c98269a..0e3c7d6455b07f 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -166,8 +166,8 @@ def resize( resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -260,8 +260,8 @@ def preprocess( `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -352,7 +352,7 @@ def preprocess( to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] - # depth-pro scales the image before resizing it + # depth-pro scales the image before resizing it # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) @@ -363,24 +363,36 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depths, - fovs=None, - target_sizes=None, - ) -> List[Dict[str, TensorType]]: + predicted_depths: Union[TensorType, List[TensorType]], + fovs: Optional[Union[TensorType, List[TensorType], None]] = None, + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: """ - Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. - Only supports PyTorch. + Post-processes the raw depth predictions from the model to generate final depth predictions and optionally + resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) + and adjusts depth values accordingly. Args: - outputs ([`DepthEstimatorOutput`]): - Raw outputs of the model. - target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. + predicted_depths (`Union[TensorType, List[TensorType]]`): + Raw depth predictions output by the model. Can be a single tensor or a list of tensors, each + corresponding to an image in the batch. + fovs (`Optional[Union[TensorType, List[TensorType], None]]`, *optional*, defaults to `None`): + Field of view (FoV) values corresponding to each depth prediction. Should have the same length + as `predicted_depths` if provided. If `None`, FoV scaling is skipped. + target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. Returns: - `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + `Dict[str, List[TensorType]]`: + A dictionary containing: + - `"predicted_depth"`: A list of processed depth tensors. + - `"fov"`: A list of processed FoV values if provided, otherwise `None`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. 
""" requires_backends(self, "torch") diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 38d699452e443a..3af05df3ccb886 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -15,7 +15,7 @@ """Fast Image processor class for DepthPro.""" import functools -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Tuple from ...image_processing_base import BatchFeature from ...image_processing_utils import get_size_dict @@ -308,24 +308,36 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depths, - fovs=None, - target_sizes=None, - ) -> List[Dict[str, TensorType]]: + predicted_depths: Union[TensorType, List[TensorType]], + fovs: Optional[Union[TensorType, List[TensorType], None]] = None, + target_sizes: Optional[Union[TensorType, List[tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: """ - Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. - Only supports PyTorch. + Post-processes the raw depth predictions from the model to generate final depth predictions and optionally + resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) + and adjusts depth values accordingly. Args: - outputs ([`DepthEstimatorOutput`]): - Raw outputs of the model. - target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. + predicted_depths (`Union[TensorType, List[TensorType]]`): + Raw depth predictions output by the model. Can be a single tensor or a list of tensors, each + corresponding to an image in the batch. + fovs (`Optional[Union[TensorType, List[TensorType], None]]`, *optional*, defaults to `None`): + Field of view (FoV) values corresponding to each depth prediction. Should have the same length + as `predicted_depths` if provided. If `None`, FoV scaling is skipped. + target_sizes (`Optional[Union[TensorType, List[tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. Returns: - `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + `Dict[str, List[TensorType]]`: + A dictionary containing: + - `"predicted_depth"`: A list of processed depth tensors. + - `"fov"`: A list of processed FoV values if provided, otherwise `None`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. 
""" requires_backends(self, "torch") diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index eb8bf02f83d160..b184b5985ba18c 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -44,6 +44,13 @@ logger = logging.get_logger(__name__) +# General docstring +_CONFIG_FOR_DOC = "DepthProConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "geetu040/DepthPro" +_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024] + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT class DepthProViTPatchEmbeddings(nn.Module): @@ -942,7 +949,7 @@ def forward( # STEP 8: return these features in order of increasing size as what fusion expects last_hidden_state = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) - *scaled_images_features, + *scaled_images_features, # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) *intermediate_features, ] @@ -1049,14 +1056,7 @@ class PreTrainedModel self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # TODO - # @add_code_sample_docstrings( - # checkpoint=_CHECKPOINT_FOR_DOC, - # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, - # config_class=_CONFIG_FOR_DOC, - # modality="vision", - # expected_output=_EXPECTED_OUTPUT_SHAPE, - # ) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -1065,6 +1065,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: + r""" + Returns: + + Examples: + TODO + ```python + ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1399,7 +1406,7 @@ def __init__(self, config, use_fov_model=None): @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -1418,37 +1425,6 @@ def forward( Examples: TODO ```python - >>> from transformers import AutoImageProcessor, DPTForDepthEstimation - >>> import torch - >>> import numpy as np - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large") - >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") - - >>> # prepare image for the model - >>> inputs = image_processor(images=image, return_tensors="pt") - - >>> with torch.no_grad(): - ... outputs = model(**inputs) - ... predicted_depth = outputs.predicted_depth - - >>> # interpolate to original size - >>> prediction = torch.nn.functional.interpolate( - ... predicted_depth.unsqueeze(1), - ... size=image.size[::-1], - ... mode="bicubic", - ... align_corners=False, - ... 
) - - >>> # visualize the prediction - >>> output = prediction.squeeze().cpu().numpy() - >>> formatted = (output * 255 / np.max(output)).astype("uint8") - >>> depth = Image.fromarray(formatted) ```""" loss = None if labels is not None: From d0a8733f275941adb827a4f7e3850c2a28d66006 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:25:43 +0500 Subject: [PATCH 31/72] more formatting --- .../models/depth_pro/image_processing_depth_pro.py | 7 +++---- src/transformers/models/depth_pro/modeling_depth_pro.py | 7 +------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 0e3c7d6455b07f..21810bfab64573 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -15,14 +15,13 @@ """Image processor class for DepthPro.""" from typing import Dict, List, Optional, Union -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import numpy as np -from icecream import ic from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import resize, to_channel_dimension_format +from ...image_transforms import to_channel_dimension_format from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, @@ -39,7 +38,7 @@ from ...utils import TensorType, filter_out_non_signature_kwargs, logging import math -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index b184b5985ba18c..3812f678b43fb9 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -14,23 +14,18 @@ # limitations under the License. 
"""PyTorch DepthPro model.""" -from icecream import ic - -import collections.abc import math -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple, Union import torch from torch import nn from dataclasses import dataclass -from ...utils import ModelOutput from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, DepthEstimatorOutput ) from ...utils import ( - add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging, From e6b385a9edf92a5c7f342935d75ae3e017fe122c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:45:20 +0500 Subject: [PATCH 32/72] formatting with ruff --- .../convert_depth_pro_weights_to_hf.py | 6 +-- .../depth_pro/image_processing_depth_pro.py | 39 ++----------------- .../image_processing_depth_pro_fast.py | 5 ++- .../models/depth_pro/modeling_depth_pro.py | 10 ++--- 4 files changed, 13 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index c3b77f17f04c69..66dfff12065a70 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -19,13 +19,13 @@ import regex as re import torch from huggingface_hub import hf_hub_download -from transformers.image_utils import PILImageResampling from transformers import ( DepthProConfig, - DepthProImageProcessorFast, DepthProForDepthEstimation, + DepthProImageProcessorFast, ) +from transformers.image_utils import PILImageResampling # fmt: off @@ -126,7 +126,7 @@ def get_qkv_state_dict(key, parameter): ) for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val - return qkv_state_dict + return qkv_state_dict def write_model( hf_repo_id: str, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 21810bfab64573..6c9c7f94e2265c 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,12 +14,10 @@ # limitations under the License. 
"""Image processor class for DepthPro.""" -from typing import Dict, List, Optional, Union -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np - from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import to_channel_dimension_format from ...image_utils import ( @@ -30,43 +28,15 @@ PILImageResampling, infer_channel_dimension_format, is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, - pil_torch_interpolation_mapping, -) -from ...utils import TensorType, filter_out_non_signature_kwargs, logging - -import math -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union - - -if TYPE_CHECKING: - from ...modeling_outputs import DepthEstimatorOutput - -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import pad, resize, to_channel_dimension_format -from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - is_scaled_image, is_torch_available, - is_torch_tensor, make_list_of_images, + pil_torch_interpolation_mapping, to_numpy_array, valid_images, ) from ...utils import ( TensorType, filter_out_non_signature_kwargs, - is_vision_available, logging, requires_backends, ) @@ -75,9 +45,6 @@ if is_torch_available(): import torch -if is_vision_available(): - import PIL - logger = logging.get_logger(__name__) @@ -379,7 +346,7 @@ def post_process_depth_estimation( Field of view (FoV) values corresponding to each depth prediction. Should have the same length as `predicted_depths` if provided. If `None`, FoV scaling is skipped. target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): - Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing is performed. diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 3af05df3ccb886..46b502d7d26f2c 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -15,7 +15,7 @@ """Fast Image processor class for DepthPro.""" import functools -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict, List, Optional, Union from ...image_processing_base import BatchFeature from ...image_processing_utils import get_size_dict @@ -35,6 +35,7 @@ from ...utils import TensorType, logging, requires_backends from ...utils.import_utils import is_torch_available, is_torchvision_available + logger = logging.get_logger(__name__) @@ -325,7 +326,7 @@ def post_process_depth_estimation( Field of view (FoV) values corresponding to each depth prediction. Should have the same length as `predicted_depths` if provided. If `None`, FoV scaling is skipped. target_sizes (`Optional[Union[TensorType, List[tuple[int, int]], None]]`, *optional*, defaults to `None`): - Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + Target sizes to resize the depth predictions. 
Can be a tensor of shape `(batch_size, 2)` or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing is performed. diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3812f678b43fb9..5b521cfda9bd3e 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -15,16 +15,16 @@ """PyTorch DepthPro model.""" import math +from dataclasses import dataclass from typing import List, Optional, Set, Tuple, Union import torch from torch import nn -from dataclasses import dataclass from ...activations import ACT2FN -from ...modeling_outputs import ( - BaseModelOutput, DepthEstimatorOutput -) +from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -32,8 +32,6 @@ replace_return_docstrings, torch_int, ) -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from .configuration_depth_pro import DepthProConfig From 267e50fbe2288de71428776adebaea51b902751c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:46:50 +0500 Subject: [PATCH 33/72] formatting with style --- src/transformers/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3d0b85e3a1b424..0e6c48762a853c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5262,6 +5262,7 @@ XLMProphetNetConfig, ) from .models.depth_anything import DepthAnythingConfig + from .models.depth_pro import DepthProConfig from .models.detr import DetrConfig from .models.dinat import DinatConfig from .models.dinov2 import Dinov2Config @@ -5281,7 +5282,6 @@ DPRReaderTokenizer, ) from .models.dpt import DPTConfig - from .models.depth_pro import DepthProConfig from .models.efficientnet import ( EfficientNetConfig, ) @@ -6107,10 +6107,10 @@ from .models.deprecated.efficientformer import EfficientFormerImageProcessor from .models.deprecated.tvlt import TvltImageProcessor from .models.deprecated.vit_hybrid import ViTHybridImageProcessor + from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor - from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.efficientnet import EfficientNetImageProcessor from .models.flava import ( FlavaFeatureExtractor, @@ -6872,6 +6872,11 @@ DepthAnythingForDepthEstimation, DepthAnythingPreTrainedModel, ) + from .models.depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) from .models.detr import ( DetrForObjectDetection, DetrForSegmentation, @@ -6918,11 +6923,6 @@ DPTModel, DPTPreTrainedModel, ) - from .models.depth_pro import ( - DepthProForDepthEstimation, - DepthProModel, - DepthProPreTrainedModel, - ) from .models.efficientnet import ( EfficientNetForImageClassification, EfficientNetModel, From a1ec99743563ae054ae159a7d83dc76e9c09a4ab Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 28 Nov 
2024 00:48:06 +0500 Subject: [PATCH 34/72] fix copied classes --- .../depth_pro/configuration_depth_pro.py | 48 ++-- .../convert_depth_pro_weights_to_hf.py | 44 ++-- .../depth_pro/image_processing_depth_pro.py | 9 +- .../image_processing_depth_pro_fast.py | 9 +- .../models/depth_pro/modeling_depth_pro.py | 225 ++++++++++-------- 5 files changed, 174 insertions(+), 161 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 8bab8227be7ec7..d938f0a721f1ae 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -78,22 +78,22 @@ class DepthProConfig(PretrainedConfig): Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. - intermediate_hook_ids - Indices of the intermediate hidden states from patch_encoder to use for fusion. - intermediate_feature_dims - Hidden state during upsampling for each intermediate hidden states in intermediate_hook_ids. - scaled_images_ratios - Use images of these ratios for patch_encoder. - scaled_images_overlap_ratios - Overlap ratio between patches for each scaled image in scaled_image_ratios. - scaled_images_feature_dims - Hidden state during upsampling for each scaled image in scaled_images_ratios. - use_batch_norm_in_fusion - Whether to use batch normalization in the residual units of the fusion blocks. - use_fov_model - Whether to use `DepthProFOVModel` to generate Field of View. - num_fov_head_layers - No of convolution layers in head of `DepthProFOVModel`. + intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`): + Indices of the intermediate hidden states from the patch encoder to use for fusion. + intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`): + Hidden state dimensions during upsampling for each intermediate hidden state in `intermediate_hook_ids`. + scaled_images_ratios (`List[float]`, *optional*, defaults to `[0.25, 0.5, 1]`): + Ratios of scaled images to be used by the patch encoder. + scaled_images_overlap_ratios (`List[float]`, *optional*, defaults to `[0.0, 0.5, 0.25]`): + Overlap ratios between patches for each scaled image in `scaled_images_ratios`. + scaled_images_feature_dims (`List[int]`, *optional*, defaults to `[1024, 1024, 512]`): + Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_fov_model (`bool`, *optional*, defaults to `True`): + Whether to use `DepthProFOVModel` to generate the field of view. + num_fov_head_layers (`int`, *optional*, defaults to `2`): + Number of convolution layers in the head of `DepthProFOVModel`. 
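These multi-scale arguments determine how many feature-fusion stages the decoder builds: one per entry in `intermediate_hook_ids` plus one per entry in `scaled_images_ratios`, as the docstring formula in patch 37 later spells out. A short sketch of instantiating the config with the defaults documented above (assumes the `DepthProConfig` class added by this PR is importable):

```python
from transformers import DepthProConfig

config = DepthProConfig(
    intermediate_hook_ids=[11, 5],
    intermediate_feature_dims=[256, 256],
    scaled_images_ratios=[0.25, 0.5, 1],
    scaled_images_overlap_ratios=[0.0, 0.5, 0.25],
    scaled_images_feature_dims=[1024, 1024, 512],
)

# one fusion block per intermediate hook plus one per scaled image
n_fusion_blocks = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios)
print(n_fusion_blocks)  # 5 with these defaults
```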
Example: @@ -134,12 +134,13 @@ def __init__( use_swiglu_ffn=False, apply_layernorm=True, reshape_hidden_states=True, - intermediate_hook_ids = [11, 5], - intermediate_feature_dims = [256, 256], - scaled_images_ratios = [0.25, 0.5, 1], - scaled_images_overlap_ratios = [0.0, 0.5, 0.25], - scaled_images_feature_dims = [1024, 1024, 512], - use_batch_norm_in_fusion=False, + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, use_fov_model=True, num_fov_head_layers=2, **kwargs, @@ -166,7 +167,8 @@ def __init__( self.use_swiglu_ffn = use_swiglu_ffn self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.use_batch_norm_in_fusion = use_batch_norm_in_fusion + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + self.use_bias_in_fusion_residual = use_bias_in_fusion_residual self.use_fov_model = use_fov_model self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 66dfff12065a70..377595b746aca5 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -93,6 +93,7 @@ } # fmt: on + def convert_old_keys_to_new_keys(state_dict_keys: dict = None): output_dict = {} if state_dict_keys is not None: @@ -106,6 +107,7 @@ def convert_old_keys_to_new_keys(state_dict_keys: dict = None): output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) return output_dict + def get_qkv_state_dict(key, parameter): """ new key which looks like this @@ -117,21 +119,20 @@ def get_qkv_state_dict(key, parameter): xxxx.v.xxxx (m//3, n) """ qkv_state_dict = {} - placeholder = re.search(r'(\(.*?\))', key).group(1) # finds "(query|key|value)" - replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] + placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)" + replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] replacements_vals = torch.split( - parameter, - split_size_or_sections=parameter.size(0)//len(replacements_keys), - dim=0 + parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0 ) for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val return qkv_state_dict + def write_model( hf_repo_id: str, output_dir: str, - safe_serialization: bool=True, + safe_serialization: bool = True, ): os.makedirs(output_dir, exist_ok=True) @@ -162,11 +163,11 @@ def write_model( use_swiglu_ffn=False, apply_layernorm=True, reshape_hidden_states=True, - intermediate_hook_ids = [11, 5], - intermediate_feature_dims = [256, 256], - scaled_images_ratios = [0.25, 0.5, 1], - scaled_images_overlap_ratios = [0.0, 0.5, 0.25], - scaled_images_feature_dims = [1024, 1024, 512], + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], use_batch_norm_in_fusion=False, use_fov_model=True, 
num_fov_head_layers=2, @@ -215,18 +216,19 @@ def write_model( DepthProForDepthEstimation.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto") print("Model reloaded successfully.") + def write_image_processor(output_dir: str): image_processor = DepthProImageProcessorFast( - do_resize = True, - size = {"height": 1536, "width": 1536}, - resample = PILImageResampling.BILINEAR, - antialias = False, - do_rescale = True, - rescale_factor = 1 / 255, - do_normalize = True, - image_mean = 0.5, - image_std = 0.5, - return_tensors = "pt", + do_resize=True, + size={"height": 1536, "width": 1536}, + resample=PILImageResampling.BILINEAR, + antialias=False, + do_rescale=True, + rescale_factor=1 / 255, + do_normalize=True, + image_mean=0.5, + image_std=0.5, + return_tensors="pt", ) image_processor.save_pretrained(output_dir) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 6c9c7f94e2265c..15a33f804d145a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -371,18 +371,13 @@ def post_process_depth_estimation( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - outputs = { - "predicted_depth": [], - "fov": [] if fovs is not None else None - } + outputs = {"predicted_depth": [], "fov": [] if fovs is not None else None} fovs = [None] * len(predicted_depths) if fovs is None else fovs target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - if target_size is not None: - # scale image w.r.t fov if fov is not None: width = target_size[1] @@ -395,7 +390,7 @@ def post_process_depth_estimation( predicted_depth.unsqueeze(0).unsqueeze(1), size=target_size, resample=self.resample, - antialias=self.antialias + antialias=self.antialias, ).squeeze() # inverse the depth diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 46b502d7d26f2c..374d5c25cafc9e 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -161,7 +161,7 @@ def _build_transforms( Resize( (size["height"], size["width"]), interpolation=pil_torch_interpolation_mapping[resample], - antialias=antialias + antialias=antialias, ) ) @@ -351,18 +351,13 @@ def post_process_depth_estimation( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - outputs = { - "predicted_depth": [], - "fov": [] if fovs is not None else None - } + outputs = {"predicted_depth": [], "fov": [] if fovs is not None else None} fovs = [None] * len(predicted_depths) if fovs is None else fovs target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - if target_size is not None: - # scale image w.r.t fov if fov is not None: width = target_size[1] diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 5b521cfda9bd3e..77983933a19add 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -40,17 +40,11 @@ # 
General docstring _CONFIG_FOR_DOC = "DepthProConfig" -# Base docstring -_CHECKPOINT_FOR_DOC = "geetu040/DepthPro" -_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024] - -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT class DepthProViTPatchEmbeddings(nn.Module): """ - This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial - `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a - Transformer. + Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings + with addition of config parameter patch_embeddings_size """ def __init__(self, config): @@ -60,6 +54,7 @@ def __init__(self, config): self.in_channels = config.num_channels self.out_channels = config.hidden_size self.patch_embeddings_size = config.patch_embeddings_size + self.num_channels = config.num_channels self.projection = nn.Conv2d( self.in_channels, @@ -68,9 +63,10 @@ def __init__(self, config): stride=(self.patch_embeddings_size, self.patch_embeddings_size), ) + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings.forward def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.config.num_channels: + if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." @@ -79,11 +75,10 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.dinov2.modeling_dinov2.DepthProViTEmbeddings -# with DepthProViT->DepthProViT and antialias=True in interpolation class DepthProViTEmbeddings(nn.Module): """ - Construct the CLS token, position and patch embeddings. 
+ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Embeddings + except antialias=True in interpolation and removal of mask_token """ def __init__(self, config: DepthProConfig) -> None: @@ -131,7 +126,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: size=(new_height, new_width), mode="bicubic", align_corners=False, - antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProPatchEmbeddings + antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProViTPatchEmbeddings ).to(dtype=target_dtype) patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) @@ -155,7 +150,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthPro class DepthProViTSelfAttention(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -216,7 +211,7 @@ def forward( return outputs -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SelfAttention with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): def __init__(self, config: DepthProConfig) -> None: super().__init__(config) @@ -226,8 +221,9 @@ def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + "DepthProViTModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( @@ -257,7 +253,7 @@ def forward( return context_layer, None -# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTSelfOutput(nn.Module): """ The residual connection is defined in DepthProViTLayer instead of here (as is the case with other models), due to the @@ -276,7 +272,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTAttention(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -316,14 +312,14 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTSdpaAttention(DepthProViTAttention): def __init__(self, config: DepthProConfig) -> None: super().__init__(config) self.attention = DepthProViTSdpaSelfAttention(config) -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaAttention with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2LayerScale with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTLayerScale(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -369,7 +365,7 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthPro class DepthProViTMLP(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -389,7 +385,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthPro class DepthProViTSwiGLUFFN(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -413,7 +409,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: } -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing class DepthProViTLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" @@ -465,7 +461,7 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -569,14 +565,14 @@ def forward( class DepthProUpsampleBlock(nn.Module): def __init__( - self, - input_dims, - intermediate_dims, - output_dims, - n_upsample_layers, - use_proj=True, - bias=False, - ) -> None: + self, + input_dims, + intermediate_dims, + output_dims, + n_upsample_layers, + use_proj=True, + bias=False, + ) -> None: super().__init__() # create first 
projection block @@ -620,6 +616,7 @@ def interpolate(pixel_values, scale_factor): align_corners=False, ) + def patch(pixel_values, patch_size, overlap_ratio): """Creates Patches from Batch.""" B, C, W, H = pixel_values.shape @@ -631,9 +628,7 @@ def patch(pixel_values, patch_size, overlap_ratio): stride = int(patch_size * (1 - overlap_ratio)) # (B, C, W, H) - patches = torch.nn.functional.unfold( - pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) - ) + patches = torch.nn.functional.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride)) # patches.shape (B, patch_size**2 * C, num_patches) patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, patch_size**2 * C) @@ -642,11 +637,12 @@ def patch(pixel_values, patch_size, overlap_ratio): return patches + def reshape_feature(hidden_states, width, height): """Discard class token and reshape 1D feature map to a 2D grid.""" B, _, C = hidden_states.shape # (B, WH+1, C) - hidden_states = hidden_states[:, 1:, :] # remove class token + hidden_states = hidden_states[:, 1:, :] # remove class token # (B, WH, C) hidden_states = hidden_states.reshape(B, width, height, C) # (B, W, H, C) @@ -654,6 +650,7 @@ def reshape_feature(hidden_states, width, height): # (B, C, W, H) return hidden_states + def merge(patches, batch_size, merge_out_size): """Recreates Batch from Patches.""" num_patches, num_channels, out_size, out_size = patches.shape @@ -668,7 +665,7 @@ def merge(patches, batch_size, merge_out_size): merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) """ - padding = ( box_size * out_size - merge_out_size ) // ( 2 * box_size - 2 ) + padding = (box_size * out_size - merge_out_size) // (2 * box_size - 2) i = 0 boxes = [] @@ -685,10 +682,10 @@ def merge(patches, batch_size, merge_out_size): box = box[..., :, padding:] if h != box_size - 1: # remove pad from height if box is not at bottom border - box = box[..., :box.shape[-2]-padding, :] + box = box[..., : box.shape[-2] - padding, :] if w != box_size - 1: # remove pad from width if box is not at right border - box = box[..., :, :box.shape[-1]-padding] + box = box[..., :, : box.shape[-1] - padding] boxes_in_row.append(box) i += 1 @@ -717,13 +714,12 @@ def __init__(self, config: DepthProConfig) -> None: self.n_scaled_images = len(self.scaled_images_ratios) self.n_intermediate_hooks = len(self.intermediate_hook_ids) self.out_size = config.patch_size // config.patch_embeddings_size - self.seq_len = self.out_size ** 2 # each patch is flattened + self.seq_len = self.out_size**2 # each patch is flattened # config.scaled_images_ratios is sorted if config.scaled_images_ratios != sorted(config.scaled_images_ratios): raise ValueError( - f"Values in scaled_images_ratios={config.scaled_images_ratios} " - "should be sorted from low to high" + f"Values in scaled_images_ratios={config.scaled_images_ratios} " "should be sorted from low to high" ) # lowest image resolution is greator than the patch_size @@ -767,7 +763,7 @@ def __init__(self, config: DepthProConfig) -> None: input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=2+i, + n_upsample_layers=2 + i, ) self.upsample_intermediate.append(upsample_block) @@ -783,7 +779,7 @@ def __init__(self, config: DepthProConfig) -> None: # for STEP 7: fuse low_res and image features self.fuse_image_with_low_res = nn.Conv2d( - 
in_channels=config.scaled_images_feature_dims[0]*2, + in_channels=config.scaled_images_feature_dims[0] * 2, out_channels=config.scaled_images_feature_dims[0], kernel_size=1, stride=1, @@ -838,7 +834,7 @@ def forward( overlap_ratio=self.scaled_images_overlap_ratios[i], ) scaled_images_num_patches = [len(i) for i in scaled_images] - patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first # (sum(scaled_images_num_patches), config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -847,16 +843,15 @@ def forward( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=True, # required for intermediate features + output_hidden_states=True, # required for intermediate features return_dict=True, ) scaled_images_last_hidden_state = torch.split_with_sizes( - patch_encodings.last_hidden_state, - scaled_images_num_patches[::-1] - )[::-1] # -1 as patch encoder expects high res patches first + patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] + )[::-1] # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -874,12 +869,12 @@ def forward( # b. reshape back to image like features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) + ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) # c. merge patches back together features = merge( - features, batch_size=B, merge_out_size=self.out_size*2**i - ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) + features, batch_size=B, merge_out_size=self.out_size * 2**i + ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) # d. upsample features = self.upsample_scaled_images[i](features) @@ -891,11 +886,14 @@ def forward( intermediate_features = [] for i in range(self.n_intermediate_hooks): - # a. extract hidden_state - layer_id = self.intermediate_hook_ids[i] + 1 # +1 to correct index position as hidden_states contain embedding output as well + layer_id = ( + self.intermediate_hook_ids[i] + 1 + ) # +1 to correct index position as hidden_states contain embedding output as well hidden_state = patch_encodings.hidden_states[layer_id] - hidden_state = hidden_state[:scaled_images_num_patches[-1]] # num_patches to be of same length as highest resolution + hidden_state = hidden_state[ + : scaled_images_num_patches[-1] + ] # num_patches to be of same length as highest resolution # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like @@ -903,12 +901,14 @@ def forward( hidden_state, self.out_size, self.out_size, - ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together features = merge( - features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (self.n_scaled_images - 1), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample features = self.upsample_intermediate[i](features) @@ -919,20 +919,26 @@ def forward( # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + hidden_state = ( + image_encodings.last_hidden_state + ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together image_features = merge( - image_features, batch_size=B, merge_out_size=self.out_size*2**(0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + image_features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - image_features = self.upsample_image(image_features) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) + image_features = self.upsample_image( + image_features + ) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) # STEP 7: apply fusion (global_features = image_features + scaled_images_features[0]) # fuses image_features with lowest resolution features as they are of same size @@ -1089,37 +1095,49 @@ def forward( return encodings -# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro -class DepthProResidualLayer(nn.Module): +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro +class DepthProPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthProConfig]`): + Model configuration class defining the model architecture. 
+ """ + def __init__(self, config): super().__init__() - self.use_batch_norm = config.use_batch_norm_in_fusion - self.hidden_size = config.fusion_hidden_size + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) self.activation1 = nn.ReLU() self.convolution1 = nn.Conv2d( - self.hidden_size, - self.hidden_size, + config.fusion_hidden_size, + config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=(not self.use_batch_norm), + bias=use_bias_in_fusion_residual, ) self.activation2 = nn.ReLU() self.convolution2 = nn.Conv2d( - self.hidden_size, - self.hidden_size, + config.fusion_hidden_size, + config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=(not self.use_batch_norm), + bias=use_bias_in_fusion_residual, ) if self.use_batch_norm: - self.batch_norm1 = nn.BatchNorm2d(self.hidden_size) - self.batch_norm2 = nn.BatchNorm2d(self.hidden_size) + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size) def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: residual = hidden_state @@ -1139,15 +1157,16 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state + residual -# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# Taken from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# except it uses deconv, skip_add and avoids interpolation (it always receives consitent inputs) class DepthProFeatureFusionLayer(nn.Module): - def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: + def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: super().__init__() self.config = config self.use_deconv = use_deconv - self.residual_layer1 = DepthProResidualLayer(config) - self.residual_layer2 = DepthProResidualLayer(config) + self.residual_layer1 = DepthProPreActResidualLayer(config) + self.residual_layer2 = DepthProPreActResidualLayer(config) if self.use_deconv: self.deconv = nn.ConvTranspose2d( @@ -1174,13 +1193,14 @@ def forward(self, hidden_state, residual=None): return hidden_state -# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro with extra layer parameters +# Take from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro +# with extra layer parameters, deconv and reversed layers class DepthProFeatureFusionStage(nn.Module): def __init__(self, config, num_layers): super().__init__() self.num_layers = num_layers self.layers = nn.ModuleList() - for _ in range(self.num_layers-1): + for _ in range(self.num_layers - 1): self.layers.append(DepthProFeatureFusionLayer(config)) # final layer doesnot require deconvolution self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) @@ -1214,7 +1234,7 @@ def __init__(self, config: DepthProConfig) -> None: self.encoder_neck = nn.Linear(self.hidden_size, self.fusion_hidden_size // 2) self.global_neck = nn.Sequential( nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True) + nn.ReLU(True), ) if config.fusion_hidden_size // 2**config.num_fov_head_layers == 0: @@ -1227,19 +1247,21 @@ def __init__(self, config: DepthProConfig) -> None: self.head = nn.Sequential() for i in range(config.num_fov_head_layers): self.head.append( - 
nn.Conv2d(self.fusion_hidden_size // 2**(i+1), self.fusion_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + nn.Conv2d( + self.fusion_hidden_size // 2 ** (i + 1), + self.fusion_hidden_size // 2 ** (i + 2), + kernel_size=3, + stride=2, + padding=1, + ) ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.fusion_hidden_size // 2**(config.num_fov_head_layers+1) + final_in_channels = self.fusion_hidden_size // 2 ** (config.num_fov_head_layers + 1) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( - in_channels=final_in_channels, - out_channels=1, - kernel_size=final_kernal_size, - stride=1, - padding=0 + in_channels=final_in_channels, out_channels=1, kernel_size=final_kernal_size, stride=1, padding=0 ) ) @@ -1263,7 +1285,7 @@ def forward( # follow the steps same as with image features in DepthProEncoder pixel_values = interpolate( pixel_values, - scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image + scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image ) patches = patch( pixel_values, @@ -1279,11 +1301,7 @@ def forward( ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.encoder_neck(last_hidden_state) - last_hidden_state = reshape_feature( - last_hidden_state, - width=self.out_size, - height=self.out_size - ) + last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) last_hidden_state = merge( last_hidden_state, batch_size=B, @@ -1321,12 +1339,11 @@ def __init__(self, config): features = config.fusion_hidden_size self.head = nn.Sequential( - nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( - in_channels=features//2, out_channels=features//2, - kernel_size=2, stride=2, padding=0, bias=True + in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True ), - nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(True), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(), @@ -1347,6 +1364,7 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. 
""" + fov: Optional[torch.FloatTensor] = None @@ -1369,7 +1387,7 @@ def __init__(self, config, use_fov_model=None): combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims self.projections = nn.ModuleList() for i, in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims)-1 and in_channels == config.fusion_hidden_size: + if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: # projection for last layer can be ignored if input and output channels already match self.projections.append(nn.Identity()) else: @@ -1397,7 +1415,6 @@ def __init__(self, config, use_fov_model=None): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1454,7 +1471,9 @@ def forward( ) fov = fov_encodings.last_hidden_state attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + hidden_states = ( + depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + ) else: fov = None attentions = depth_pro_outputs.attentions From 3c656f24a5e33fed84663f2c0d45053b2b3c4e91 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 28 Nov 2024 01:29:54 +0500 Subject: [PATCH 35/72] add examples; update weight convert script --- .../convert_depth_pro_weights_to_hf.py | 4 +- .../models/depth_pro/modeling_depth_pro.py | 58 ++++++++++++++++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 377595b746aca5..cd06a99c5fb2b4 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -168,7 +168,8 @@ def write_model( scaled_images_ratios=[0.25, 0.5, 1], scaled_images_overlap_ratios=[0.0, 0.5, 0.25], scaled_images_feature_dims=[1024, 1024, 512], - use_batch_norm_in_fusion=False, + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, use_fov_model=True, num_fov_head_layers=2, ) @@ -228,7 +229,6 @@ def write_image_processor(output_dir: str): do_normalize=True, image_mean=0.5, image_std=0.5, - return_tensors="pt", ) image_processor.save_pretrained(output_dir) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 77983933a19add..255174de09934b 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1068,8 +1068,34 @@ def forward( Returns: Examples: - TODO + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, DepthProModel + >>> + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> + >>> checkpoint = "geetu040/DepthPro" + >>> processor = AutoProcessor.from_pretrained(checkpoint) + >>> model = DepthProModel.from_pretrained(checkpoint) + >>> + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + >>> + >>> with torch.no_grad(): + ... 
output = model(**inputs) + ... + >>> for state in output.last_hidden_state: + ... print(state.shape) + ... + torch.Size([1, 1024, 48, 48]) + torch.Size([1, 1024, 96, 96]) + torch.Size([1, 512, 192, 192]) + torch.Size([1, 256, 384, 384]) + torch.Size([1, 256, 768, 768]) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1433,8 +1459,36 @@ def forward( Returns: Examples: - TODO + ```python + >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation + >>> import torch + >>> from PIL import Image + >>> import requests + >>> + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> + >>> checkpoint = "geetu040/DepthPro" + >>> processor = AutoImageProcessor.from_pretrained(checkpoint) + >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) + >>> + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + >>> + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... + >>> # interpolate to original size + >>> post_processed_output = processor.post_process_depth_estimation( + ... outputs.predicted_depth, outputs.fov, target_sizes=[(image.height, image.width)], + ... ) + >>> + >>> # visualize the prediction + >>> predicted_depth = post_processed_output["predicted_depth"][0] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) ```""" loss = None if labels is not None: From f6f6d3d130b97519b8f9bf0ae9413301f655ecd9 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 10:08:56 +0500 Subject: [PATCH 36/72] fix using check_table.py and isort --- docs/source/en/index.md | 1 + src/transformers/__init__.py | 18 ++++++++-------- .../models/auto/configuration_auto.py | 4 ++-- .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 6 +++--- .../models/gemma/configuration_gemma.py | 1 - src/transformers/utils/dummy_pt_objects.py | 21 +++++++++++++++++++ .../utils/dummy_vision_objects.py | 14 +++++++++++++ 8 files changed, 51 insertions(+), 16 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index aaff45ab65dfb6..d316e89ce6f45d 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -117,6 +117,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [DeiT](model_doc/deit) | ✅ | ✅ | ❌ | | [DePlot](model_doc/deplot) | ✅ | ❌ | ❌ | | [Depth Anything](model_doc/depth_anything) | ✅ | ❌ | ❌ | +| [DepthPro](model_doc/depth_pro) | ✅ | ❌ | ❌ | | [DETA](model_doc/deta) | ✅ | ❌ | ❌ | | [DETR](model_doc/detr) | ✅ | ❌ | ❌ | | [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ | diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0e6c48762a853c..d4ac4b5fd866fa 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -387,6 +387,7 @@ "models.deprecated.vit_hybrid": ["ViTHybridConfig"], "models.deprecated.xlm_prophetnet": ["XLMProphetNetConfig"], "models.depth_anything": ["DepthAnythingConfig"], + "models.depth_pro": ["DepthProConfig"], "models.detr": ["DetrConfig"], "models.dialogpt": [], "models.dinat": ["DinatConfig"], @@ -408,7 +409,6 @@ "DPRReaderTokenizer", ], "models.dpt": ["DPTConfig"], - "models.depth_pro": ["DepthProConfig"], "models.efficientnet": ["EfficientNetConfig"], "models.electra": [ "ElectraConfig", @@ -1193,10 +1193,10 @@ _import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.deprecated.tvlt"].append("TvltImageProcessor") _import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"]) + _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) - _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) @@ -2078,6 +2078,13 @@ "DepthAnythingPreTrainedModel", ] ) + _import_structure["models.depth_pro"].extend( + [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + ) _import_structure["models.detr"].extend( [ "DetrForObjectDetection", @@ -2138,13 +2145,6 @@ "DPTPreTrainedModel", ] ) - _import_structure["models.depth_pro"].extend( - [ - "DepthProForDepthEstimation", - "DepthProModel", - "DepthProPreTrainedModel", - ] - ) _import_structure["models.efficientnet"].extend( [ "EfficientNetForImageClassification", diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d8860d38f85046..a02af514b65aa1 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -85,6 +85,7 @@ ("deformable_detr", "DeformableDetrConfig"), ("deit", "DeiTConfig"), ("depth_anything", "DepthAnythingConfig"), + ("depth_pro", "DepthProConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), ("dinat", "DinatConfig"), @@ -93,7 +94,6 @@ ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), - ("depth_pro", "DepthProConfig"), ("efficientformer", "EfficientFormerConfig"), ("efficientnet", "EfficientNetConfig"), ("electra", "ElectraConfig"), @@ -385,6 +385,7 @@ ("deplot", "DePlot"), ("depth_anything", "Depth Anything"), ("depth_anything_v2", "Depth Anything V2"), + ("depth_pro", "DepthPro"), ("deta", "DETA"), ("detr", 
"DETR"), ("dialogpt", "DialoGPT"), @@ -395,7 +396,6 @@ ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), - ("depth_pro", "DepthPro"), ("efficientformer", "EfficientFormer"), ("efficientnet", "EfficientNet"), ("electra", "ELECTRA"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index e7b53f30a7a064..3887f29415b052 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -71,13 +71,13 @@ ("deformable_detr", ("DeformableDetrImageProcessor",)), ("deit", ("DeiTImageProcessor",)), ("depth_anything", ("DPTImageProcessor",)), + ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("deta", ("DetaImageProcessor",)), ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")), ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("dinov2", ("BitImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)), ("dpt", ("DPTImageProcessor",)), - ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor",)), ("efficientnet", ("EfficientNetImageProcessor",)), ("flava", ("FlavaImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4cc15ca4ca51c2..b8bcd0cbcb00a9 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -84,6 +84,7 @@ ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), ("dinat", "DinatModel"), @@ -92,7 +93,6 @@ ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), - ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("electra", "ElectraModel"), @@ -567,12 +567,12 @@ ("data2vec-vision", "Data2VecVisionModel"), ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dpt", "DPTModel"), - ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), @@ -867,8 +867,8 @@ [ # Model for depth estimation mapping ("depth_anything", "DepthAnythingForDepthEstimation"), - ("dpt", "DPTForDepthEstimation"), ("depth_pro", "DepthProForDepthEstimation"), + ("dpt", "DPTForDepthEstimation"), ("glpn", "GLPNForDepthEstimation"), ("zoedepth", "ZoeDepthForDepthEstimation"), ] diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index e170803cccab70..346f386ba698f2 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,7 +20,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 36e1ff2cfe65c4..dc32f6d653d635 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3457,6 +3457,27 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class DepthProForDepthEstimation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DepthProModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DepthProPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class DetrForObjectDetection(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 19cf02a4e85826..1ceb9e227bb2d9 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -177,6 +177,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class DepthProImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + +class DepthProImageProcessorFast(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class DetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From b4575d026de8a8ca69650c76ab3b21f22e860a48 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 10:45:19 +0500 Subject: [PATCH 37/72] fix config docstring --- .../models/depth_pro/configuration_depth_pro.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d938f0a721f1ae..d48d68b832b472 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -38,7 +38,7 @@ class DepthProConfig(PretrainedConfig): The number of channels before fusion. num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): + num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. mlp_ratio (`int`, *optional*, defaults to 4): Ratio of the hidden size of the MLPs relative to the `hidden_size`. @@ -58,7 +58,7 @@ class DepthProConfig(PretrainedConfig): To generate depth of same size as image, image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - patch_size (`int`, *optional*, defaults to 14): + patch_size (`int`, *optional*, defaults to 384): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. @@ -90,9 +90,11 @@ class DepthProConfig(PretrainedConfig): Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. 
use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): + Whether to use bias in the pre-activate residual units of the fusion blocks. use_fov_model (`bool`, *optional*, defaults to `True`): Whether to use `DepthProFOVModel` to generate the field of view. - num_fov_head_layers (`int`, *optional*, defaults to `2`): + num_fov_head_layers (`int`, *optional*, defaults to 2): Number of convolution layers in the head of `DepthProFOVModel`. Example: From c8d8a9e0ca3750cc062fe9ad3b90fdbe5a893f0b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 11:26:12 +0500 Subject: [PATCH 38/72] add depth pro to sdpa docs --- docs/source/en/perf_infer_gpu_one.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 67bd31fdaeede5..4f1ccc9c427c37 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -227,6 +227,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) +* [DepthPro](https://huggingface.co/docs/transformers/model_doc/depth_pro#transformers.DepthProModel) * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) From 77873de8a34447d64d16e1a5def4ba8fb7109bb5 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Fri, 29 Nov 2024 15:30:42 +0500 Subject: [PATCH 39/72] undo unintentional changes in configuration_gemma.py --- src/transformers/models/gemma/configuration_gemma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 346f386ba698f2..e170803cccab70 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,6 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
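With the `depth_pro` auto-class mappings registered in patch 36 and `DepthProModel` added to the SDPA list in patch 38 above, loading can go through the Auto classes and pick the attention backend explicitly. A sketch, assuming the PR is merged and the `geetu040/DepthPro` checkpoint used in the earlier doctest examples is available:

```python
import torch
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

# both calls resolve through the depth_pro entries added in patch 36
processor = AutoImageProcessor.from_pretrained("geetu040/DepthPro")
model = AutoModelForDepthEstimation.from_pretrained(
    "geetu040/DepthPro",
    attn_implementation="sdpa",  # "eager" selects the manual attention path mentioned in the warning earlier in this series
    torch_dtype=torch.float16,
)
```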
+ from ...configuration_utils import PretrainedConfig From 5f2378d112193317902a733d13b21fc081fc8b56 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 30 Nov 2024 23:51:55 +0500 Subject: [PATCH 40/72] minor fixes --- src/transformers/models/__init__.py | 1 + .../depth_pro/image_processing_depth_pro.py | 24 +++++++++++-------- .../models/depth_pro/modeling_depth_pro.py | 7 +----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 9155f629e63f91..fc26362dd64dc4 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -69,6 +69,7 @@ deit, deprecated, depth_anything, + depth_pro, detr, dialogpt, dinat, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 15a33f804d145a..746f246fcd73a9 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -113,7 +113,7 @@ def __init__( def resize( self, - images: List[np.ndarray], + image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, antialias: bool = False, @@ -125,8 +125,8 @@ def resize( Resize an image to `(size["height"], size["width"])`. Args: - images (`List[np.ndarray]`): - Images to resize. + image (`np.ndarray`): + Image to resize. size (`Dict[str, int]`): Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): @@ -157,16 +157,13 @@ def resize( raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - images = np.stack(images) - images = torch.from_numpy(images) - return torch.nn.functional.interpolate( # input should be (B, C, H, W) - input=images, + input=torch.from_numpy(image).unsqueeze(0), size=output_size, mode=pil_torch_interpolation_mapping[resample].value, antialias=antialias, - ) + ).squeeze(0).numpy() def _validate_input_arguments( self, @@ -321,8 +318,15 @@ def preprocess( # depth-pro scales the image before resizing it # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: - images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) - images = images.numpy() + images = [ + self.resize( + image=image, + size=size, + resample=resample, + antialias=antialias, + ) + for image in images + ] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 255174de09934b..16601f9c7c8621 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -660,7 +660,7 @@ def merge(patches, batch_size, merge_out_size): # patches are not created when scaled image size is equal to patch size return patches - box_size = int(math.sqrt(num_patches // batch_size)) + box_size = math.ceil(math.sqrt(num_patches // batch_size)) """ merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) @@ -806,11 +806,6 @@ def forward( B, C, H, W = pixel_values.shape - if not (H == W == self.config.image_size): - raise ValueError( - f"Height={H} and Width={W} doesnot match the specified image_size={self.config.image_size} in config." - ) - if not (C == self.config.num_channels): raise ValueError( f"Found {C} channels in image, expected number of channels is {self.config.num_channels} from config." From d51d0b198824370c47650ca6cc49f403e9c752cc Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 30 Nov 2024 23:57:26 +0500 Subject: [PATCH 41/72] test image processing --- .../test_image_processing_depth_pro.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 tests/models/depth_pro/test_image_processing_depth_pro.py diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py new file mode 100644 index 00000000000000..eea9ed01378db9 --- /dev/null +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
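The reworked `resize` above now processes one image at a time through `torch.nn.functional.interpolate`. The sketch below isolates that pattern with an explicit `"bilinear"` mode standing in for `pil_torch_interpolation_mapping[resample].value`; it is a simplification for illustration, not the processor's exact code path.

```python
import numpy as np
import torch

def resize_single_image(image: np.ndarray, height: int, width: int, antialias: bool = False) -> np.ndarray:
    # image is expected channels-first (C, H, W), matching the ChannelDimension.FIRST
    # layout the processor uses before calling torch interpolation
    tensor = torch.from_numpy(image).unsqueeze(0)  # (1, C, H, W)
    resized = torch.nn.functional.interpolate(
        input=tensor,
        size=(height, width),
        mode="bilinear",  # stand-in for pil_torch_interpolation_mapping[resample].value
        antialias=antialias,
    )
    return resized.squeeze(0).numpy()

image = np.random.rand(3, 96, 128).astype(np.float32)
print(resize_single_image(image, 18, 18).shape)  # (3, 18, 18)
```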
+ + +import unittest + +from transformers.file_utils import is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import DepthProImageProcessor, DepthProImageProcessorFast + + +class DepthProImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + super().__init__() + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DepthProImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DepthProImageProcessor if is_vision_available() else None + fast_image_processing_class = DepthProImageProcessorFast if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DepthProImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "resample")) + self.assertTrue(hasattr(image_processing, "antialias")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) From 082b05555df1b7b55335d6790582f47b0e6c4ca1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 02:01:42 +0500 Subject: [PATCH 42/72] fixes and tests --- docs/source/en/model_doc/depth_pro.md | 119 +++++++ 
.../depth_pro/configuration_depth_pro.py | 2 +- .../depth_pro/image_processing_depth_pro.py | 1 - .../models/depth_pro/modeling_depth_pro.py | 177 +++++---- tests/models/depth_pro/__init__.py | 0 .../depth_pro/test_modeling_depth_pro.py | 335 ++++++++++++++++++ 6 files changed, 558 insertions(+), 76 deletions(-) create mode 100644 docs/source/en/model_doc/depth_pro.md create mode 100644 tests/models/depth_pro/__init__.py create mode 100644 tests/models/depth_pro/test_modeling_depth_pro.py diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md new file mode 100644 index 00000000000000..6472cc506dae72 --- /dev/null +++ b/docs/source/en/model_doc/depth_pro.md @@ -0,0 +1,119 @@ + + +# DepthPro + +## Overview + +The DepthPro model was proposed in [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073) by Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, Vladlen Koltun. + +It leverages a multi-scale [Vision Transformer (ViT)](vit) optimized for dense predictions. It downsamples an image at several scales. At each scale, it is split into patches, which are processed by a ViT-based [Dinov2](dinov2) patch encoder, with weights shared across scales. Patches are merged into feature maps, upsampled, and fused via a [DPT](dpt) like decoder. + +The abstract from the paper is the following: + +*We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* + + + + DepthPro architecture. Taken from the original paper. + +This model was contributed by [geetu040](https://github.com/geetu040). The original code can be found [here](https://github.com/apple/ml-depth-pro). + + + +## Usage tips + +```python +from transformers import Dinov2Config, DepthProConfig, DepthProForDepthEstimation + +# initialize with a Transformer-based backbone such as DINOv2 +# in that case, we also specify `reshape_hidden_states=False` to get feature maps of shape (batch_size, num_channels, height, width) +backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False) + +config = DepthProConfig(backbone_config=backbone_config) +model = DepthProForDepthEstimation(config=config) +``` + +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. 
See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import ViTForImageClassification +model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-base-patch16-224` model, we saw the following speedups during inference. + +| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) | +|--------------|-------------------------------------------|-------------------------------------------|------------------------------| +| 1 | 7 | 6 | 1.17 | +| 2 | 8 | 6 | 1.33 | +| 4 | 8 | 6 | 1.33 | +| 8 | 8 | 6 | 1.33 | + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro. + +- Demo notebooks for [`DepthProForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DepthPro). + +- [Semantic segmentation task guide](../tasks/semantic_segmentation) +- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## DepthProConfig + +[[autodoc]] DepthProConfig + +## DepthProFeatureExtractor + +[[autodoc]] DepthProFeatureExtractor + - __call__ + - post_process_semantic_segmentation + +## DepthProImageProcessor + +[[autodoc]] DepthProImageProcessor + - preprocess + - post_process_semantic_segmentation + +## DepthProModel + +[[autodoc]] DepthProModel + - forward + +## DepthProForDepthEstimation + +[[autodoc]] DepthProForDepthEstimation + - forward + +## DepthProForSemanticSegmentation + +[[autodoc]] DepthProForSemanticSegmentation + - forward diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d48d68b832b472..beb3215d8ddf8d 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -56,7 +56,7 @@ class DepthProConfig(PretrainedConfig): image_size (`int`, *optional*, defaults to 1536): The size (resolution) of each image, To generate depth of same size as image, - image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size + image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) patch_size (`int`, *optional*, defaults to 384): The size (resolution) of each patch. 
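To make the corrected size constraint above concrete, here is a quick arithmetic check using the toy values that the model tests later in this series use (image_size=64, patch_size=8, patch_embeddings_size=4, two intermediate hooks and two scaled-image ratios); this only illustrates the formula and is not additional model code.

```python
# Check: image_size / 2**(n_fusion_blocks + 1) == patch_size / patch_embeddings_size
intermediate_hook_ids = [1, 0]
scaled_images_ratios = [0.5, 1.0]
n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios)  # 4

image_size, patch_size, patch_embeddings_size = 64, 8, 4
assert image_size / 2 ** (n_fusion_blocks + 1) == patch_size / patch_embeddings_size  # 2.0 == 2.0
```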
diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 746f246fcd73a9..65a29900c63744 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -264,7 +264,6 @@ def preprocess( image_std = image_std if image_std is not None else self.image_std size = size if size is not None else self.size - size_dict = get_size_dict(size) images = make_list_of_images(images) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 16601f9c7c8621..2e074588d4e301 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -31,6 +31,7 @@ logging, replace_return_docstrings, torch_int, + ModelOutput, ) from .configuration_depth_pro import DepthProConfig @@ -87,9 +88,9 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.seq_len = (config.patch_size // config.patch_embeddings_size) ** 2 - self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) - self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.zeros(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: @@ -698,6 +699,35 @@ def merge(patches, batch_size, merge_out_size): return boxes +@dataclass +class DepthProOutput(ModelOutput): + """ + Base class for DepthPro's outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + features (`List[torch.FloatTensor]`, *optional*: + Features from scaled images and hidden_states. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + last_hidden_state: torch.FloatTensor = None + features: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -794,7 +824,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: + ) -> Union[tuple, DepthProOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -848,8 +878,8 @@ def forward( image_encodings = self.image_encoder( pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, return_dict=True, ) @@ -941,21 +971,36 @@ def forward( scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) # STEP 8: return these features in order of increasing size as what fusion expects - last_hidden_state = [ + features = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) *scaled_images_features, # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) *intermediate_features, ] - hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None - attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None + # prepare last_hidden_state, hidden_states, attentions from patches to batches + + last_hidden_state = patch_encodings.last_hidden_state + hidden_states = patch_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions if output_attentions else None + + num_patches = sum(scaled_images_num_patches) + # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 + indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T + indexes = indexes.to(last_hidden_state.device) + + last_hidden_state = last_hidden_state[indexes].mean(1) + if hidden_states is not None: + hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) + if attentions is not None: + attentions = tuple([state[indexes].mean(1) for state in attentions]) if not return_dict: - return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) + return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) - return BaseModelOutput( + return DepthProOutput( last_hidden_state=last_hidden_state, + features=features, hidden_states=hidden_states, attentions=attentions, ) @@ -1034,11 +1079,7 @@ def __init__(self, config): self.post_init() def get_input_embeddings(self): - embeddings = { - "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, - "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, - } - return embeddings + return self.encoder.patch_encoder.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune): """ @@ -1058,7 +1099,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> 
Union[Tuple, BaseModelOutput]: + ) -> Union[Tuple, DepthProOutput]: r""" Returns: @@ -1215,7 +1256,7 @@ def forward(self, hidden_state, residual=None): # Take from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro -# with extra layer parameters, deconv and reversed layers +# with num_layers, deconv and reversed layers class DepthProFeatureFusionStage(nn.Module): def __init__(self, config, num_layers): super().__init__() @@ -1269,8 +1310,8 @@ def __init__(self, config: DepthProConfig) -> None: for i in range(config.num_fov_head_layers): self.head.append( nn.Conv2d( - self.fusion_hidden_size // 2 ** (i + 1), - self.fusion_hidden_size // 2 ** (i + 2), + math.ceil(self.fusion_hidden_size / 2 ** (i + 1)), + math.ceil(self.fusion_hidden_size / 2 ** (i + 2)), kernel_size=3, stride=2, padding=1, @@ -1278,7 +1319,7 @@ def __init__(self, config: DepthProConfig) -> None: ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.fusion_hidden_size // 2 ** (config.num_fov_head_layers + 1) + final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( @@ -1291,16 +1332,7 @@ def forward( pixel_values: torch.Tensor, global_features: torch.Tensor, head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + ) -> torch.Tensor: B, C, W, H = pixel_values.shape # follow the steps same as with image features in DepthProEncoder @@ -1316,11 +1348,11 @@ def forward( encoder_outputs = self.encoder( patches, head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, + return_dict=True, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.encoder_neck(last_hidden_state) last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) last_hidden_state = merge( @@ -1335,15 +1367,7 @@ def forward( fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(B) - if not return_dict: - head_outputs = (fov_output,) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutput( - last_hidden_state=fov_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + return fov_output class DepthProDepthEstimationHead(nn.Module): @@ -1377,16 +1401,36 @@ def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: @dataclass -class DepthProDepthEstimatorOutput(DepthEstimatorOutput): +class DepthProDepthEstimatorOutput(ModelOutput): """ - Base class for outputs of DepthProDepthEstimator. + Base class for DepthProForDepthEstimation's output. 
Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + fov (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. """ + loss: Optional[torch.FloatTensor] = None + predicted_depth: torch.FloatTensor = None fov: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @add_start_docstrings( @@ -1502,41 +1546,26 @@ def forward( output_hidden_states=output_hidden_states, return_dict=True, ) - last_hidden_state = depth_pro_outputs.last_hidden_state - last_hidden_state = [proj(state) for proj, state in zip(self.projections, last_hidden_state)] - fused_state = self.fusion_stage(last_hidden_state) - predicted_depth = self.head(fused_state) + features = depth_pro_outputs.features + features = [proj(feature) for proj, feature in zip(self.projections, features)] + fused_features = self.fusion_stage(features) + predicted_depth = self.head(fused_features) - if self.use_fov_model: + fov = self.fov_model( + pixel_values=pixel_values, # use lowest scaled image features for fov model - global_features = last_hidden_state[0].detach() - fov_encodings = self.fov_model( - pixel_values=pixel_values, - global_features=global_features, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - fov = fov_encodings.last_hidden_state - attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = ( - depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None - ) - else: - fov = None - attentions = depth_pro_outputs.attentions - hidden_states = depth_pro_outputs.hidden_states + global_features=features[0].detach(), + head_mask=head_mask, + ) if self.use_fov_model else None if not return_dict: - outputs = (predicted_depth, fov, hidden_states, attentions) - outputs = (i for i in outputs if i is not None) - return outputs + outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] + return tuple(v for v in outputs if v is not None) return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, fov=fov, - hidden_states=hidden_states, - attentions=attentions, + 
hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, ) diff --git a/tests/models/depth_pro/__init__.py b/tests/models/depth_pro/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py new file mode 100644 index 00000000000000..3d37965dcd1bd0 --- /dev/null +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -0,0 +1,335 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch DepthPro model.""" + +import unittest + +from transformers import DepthProConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DepthProForDepthEstimation, DepthProModel + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import DepthProImageProcessor + + +class DepthProModelTester: + def __init__( + self, + parent, + batch_size=8, + image_size=64, + patch_size=8, + patch_embeddings_size=4, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + fusion_hidden_size=16, + intermediate_hook_ids=[1, 0], + intermediate_feature_dims=[8, 8], + scaled_images_ratios=[0.5, 1.0], + scaled_images_overlap_ratios=[0.0, 0.2], + scaled_images_feature_dims=[12, 12], + num_hidden_layers=2, + num_attention_heads=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + use_fov_model=True, + num_labels=3, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.patch_embeddings_size = patch_embeddings_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.fusion_hidden_size = fusion_hidden_size + self.intermediate_hook_ids = intermediate_hook_ids + self.intermediate_feature_dims = intermediate_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.use_fov_model = use_fov_model + 
self.num_labels = num_labels + + self.num_patches = (patch_size // patch_embeddings_size) ** 2 + self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DepthProConfig( + image_size=self.image_size, + patch_size=self.patch_size, + patch_embeddings_size=self.patch_embeddings_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + fusion_hidden_size=self.fusion_hidden_size, + intermediate_hook_ids=self.intermediate_hook_ids, + intermediate_feature_dims=self.intermediate_feature_dims, + scaled_images_ratios=self.scaled_images_ratios, + scaled_images_overlap_ratios=self.scaled_images_overlap_ratios, + scaled_images_feature_dims=self.scaled_images_feature_dims, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + use_fov_model=self.use_fov_model, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DepthProModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DepthProForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DepthProModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DepthPro does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (DepthProModel, DepthProForDepthEstimation) if is_torch_available() else () + pipeline_model_mapping = ( + { + "depth-estimation": DepthProForDepthEstimation, + "image-feature-extraction": DepthProModel, + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DepthProModelTester(self) + self.config_tester = ConfigTester(self, config_class=DepthProConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DepthPro does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + # Skip the check for the backbone + backbone_params = [] + for name, module in model.named_modules(): + if module.__class__.__name__ == "DepthProViTHybridEmbeddings": + backbone_params = [f"{name}.{key}" 
for key in module.state_dict().keys()] + break + + for name, param in model.named_parameters(): + if param.requires_grad: + if name in backbone_params: + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @slow + def test_model_from_pretrained(self): + model_name = "Intel/depth_pro-large" + model = DepthProModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DepthProModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") + model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 384)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) + + def test_post_processing_depth_estimation(self): + image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") + model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large") + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] + expected_shape = torch.Size((384, 384)) + self.assertTrue(predicted_depth.shape == expected_shape) + + predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) + predicted_depth_l = predicted_depth_l[0]["predicted_depth"] + expected_shape = torch.Size((500, 500)) + self.assertTrue(predicted_depth_l.shape == expected_shape) + + output_enlarged = torch.nn.functional.interpolate( + predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False + ).squeeze() + self.assertTrue(output_enlarged.shape == expected_shape) + self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3)) From 16a39178307e3d2b484fb0df44e3ff05e0b67aff Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 02:20:22 +0500 Subject: [PATCH 43/72] more fixes --- docs/source/en/model_doc/depth_pro.md | 19 +++++++------------ .../depth_pro/configuration_depth_pro.py | 10 ---------- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 6472cc506dae72..7e4ac13f1d648f 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -91,17 +91,17 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DepthProConfig -## DepthProFeatureExtractor - -[[autodoc]] 
DepthProFeatureExtractor - - __call__ - - post_process_semantic_segmentation - ## DepthProImageProcessor [[autodoc]] DepthProImageProcessor - preprocess - - post_process_semantic_segmentation + - post_process_depth_estimation + +## DepthProImageProcessorFast + +[[autodoc]] DepthProImageProcessorFast + - preprocess + - post_process_depth_estimation ## DepthProModel @@ -112,8 +112,3 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DepthProForDepthEstimation - forward - -## DepthProForSemanticSegmentation - -[[autodoc]] DepthProForSemanticSegmentation - - forward diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index beb3215d8ddf8d..46220a0731e6f7 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -72,12 +72,6 @@ class DepthProConfig(PretrainedConfig): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): Whether to use the SwiGLU feedforward neural network. - apply_layernorm (`bool`, *optional*, defaults to `True`): - Whether to apply layer normalization to the feature maps in case the model is used as backbone. - reshape_hidden_states (`bool`, *optional*, defaults to `True`): - Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in - case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, - seq_len, hidden_size)`. intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`): Indices of the intermediate hidden states from the patch encoder to use for fusion. 
intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`): @@ -134,8 +128,6 @@ def __init__( layerscale_value=1.0, drop_path_rate=0.0, use_swiglu_ffn=False, - apply_layernorm=True, - reshape_hidden_states=True, intermediate_hook_ids=[11, 5], intermediate_feature_dims=[256, 256], scaled_images_ratios=[0.25, 0.5, 1], @@ -167,8 +159,6 @@ def __init__( self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate self.use_swiglu_ffn = use_swiglu_ffn - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual self.use_bias_in_fusion_residual = use_bias_in_fusion_residual self.use_fov_model = use_fov_model diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 2e074588d4e301..27754c5dbafcbf 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -22,16 +22,16 @@ from torch import nn from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( + ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, torch_int, - ModelOutput, ) from .configuration_depth_pro import DepthProConfig From 2408ec54e4f27d2abbecdb8374e58f34d91d8e96 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 12:18:09 +0500 Subject: [PATCH 44/72] use output states from image_encoder instead --- .../models/depth_pro/modeling_depth_pro.py | 49 ++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 27754c5dbafcbf..00241bb8646582 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -103,7 +103,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = self.position_embeddings.shape[1] - 1 + num_positions = embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +117,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - sqrt_num_positions = torch_int(num_positions**0.5) - patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) + patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype @@ -734,6 +734,7 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.fusion_hidden_size = config.fusion_hidden_size + self.patch_size = config.patch_size 
self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -867,7 +868,7 @@ def forward( patch_encodings = self.patch_encoder( patches, head_mask=head_mask, - output_attentions=output_attentions, + output_attentions=False, output_hidden_states=True, # required for intermediate features return_dict=True, ) @@ -875,11 +876,18 @@ def forward( patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first + # scale the image to patch size for image_encoder + scaled_image_to_patch_size = nn.functional.interpolate( + pixel_values, + size=(self.patch_size, self.patch_size), + mode="bilinear", + align_corners=False, + ) image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=scaled_image_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, return_dict=True, ) @@ -946,19 +954,15 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + ) # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (B, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = merge( - image_features, - batch_size=B, - merge_out_size=self.out_size * 2 ** (0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + # no merge required for image_features as they are already in batches instead of patches # d. upsample image_features = self.upsample_image( @@ -980,20 +984,9 @@ def forward( # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = patch_encodings.last_hidden_state - hidden_states = patch_encodings.hidden_states if output_hidden_states else None - attentions = patch_encodings.attentions if output_attentions else None - - num_patches = sum(scaled_images_num_patches) - # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 - indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T - indexes = indexes.to(last_hidden_state.device) - - last_hidden_state = last_hidden_state[indexes].mean(1) - if hidden_states is not None: - hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) - if attentions is not None: - attentions = tuple([state[indexes].mean(1) for state in attentions]) + last_hidden_state = image_encodings.last_hidden_state + hidden_states = image_encodings.hidden_states if output_hidden_states else None + attentions = image_encodings.attentions if output_attentions else None if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) From be0c2a37478589c31d5b3864f16b955f952b43cd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 09:13:25 +0500 Subject: [PATCH 45/72] Revert "use output states from image_encoder instead" This reverts commit 2408ec54e4f27d2abbecdb8374e58f34d91d8e96. 
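The revert above restores the patch-to-batch averaging added in patch 42, which relies on the `torch.arange(...).reshape(...).T` index pattern. The small sketch below only demonstrates that pattern for the toy values quoted in the in-code comment (num_patches=9, batch_size=3); nothing here is model-specific.

```python
import torch

num_patches, batch_size = 9, 3
# patches are stored interleaved by batch element: patch index = patch_idx * batch_size + batch_idx
indexes = torch.arange(num_patches).reshape(num_patches // batch_size, -1).T
print(indexes.tolist())  # [[0, 3, 6], [1, 4, 7], [2, 5, 8]]
```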
--- .../models/depth_pro/modeling_depth_pro.py | 49 +++++++++++-------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 00241bb8646582..27754c5dbafcbf 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -103,7 +103,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +117,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) - patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype @@ -734,7 +734,6 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.fusion_hidden_size = config.fusion_hidden_size - self.patch_size = config.patch_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -868,7 +867,7 @@ def forward( patch_encodings = self.patch_encoder( patches, head_mask=head_mask, - output_attentions=False, + output_attentions=output_attentions, output_hidden_states=True, # required for intermediate features return_dict=True, ) @@ -876,18 +875,11 @@ def forward( patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first - # scale the image to patch size for image_encoder - scaled_image_to_patch_size = nn.functional.interpolate( - pixel_values, - size=(self.patch_size, self.patch_size), - mode="bilinear", - align_corners=False, - ) image_encodings = self.image_encoder( - pixel_values=scaled_image_to_patch_size, + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, return_dict=True, ) @@ -954,15 +946,19 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (B, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together - # no merge required for image_features as they are already in batches instead of patches + image_features = merge( + image_features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample image_features = self.upsample_image( @@ -984,9 +980,20 @@ def forward( # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = image_encodings.last_hidden_state - hidden_states = image_encodings.hidden_states if output_hidden_states else None - attentions = image_encodings.attentions if output_attentions else None + last_hidden_state = patch_encodings.last_hidden_state + hidden_states = patch_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions if output_attentions else None + + num_patches = sum(scaled_images_num_patches) + # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 + indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T + indexes = indexes.to(last_hidden_state.device) + + last_hidden_state = last_hidden_state[indexes].mean(1) + if hidden_states is not None: + hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) + if attentions is not None: + attentions = tuple([state[indexes].mean(1) for state in attentions]) if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) From efed39f86e629a56df892f45dcbb5d4dc05222a4 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 09:18:16 +0500 Subject: [PATCH 46/72] make embeddings dynamic --- src/transformers/models/depth_pro/modeling_depth_pro.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 27754c5dbafcbf..4f97f37230cbbb 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -80,6 +80,7 @@ class DepthProViTEmbeddings(nn.Module): """ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Embeddings except antialias=True in interpolation and removal of mask_token + and enabling dynamic embeddings. 
""" def __init__(self, config: DepthProConfig) -> None: @@ -103,7 +104,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = self.position_embeddings.shape[1] - 1 + num_positions = embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +118,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - sqrt_num_positions = torch_int(num_positions**0.5) - patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) + patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype From c3b14fbcc54a1877bf6ebb7b7b61d9d67f1753ce Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 10:58:45 +0500 Subject: [PATCH 47/72] reshape output hidden states and attentions as part of computation graph --- .../models/depth_pro/modeling_depth_pro.py | 114 +++++++++++++----- .../depth_pro/test_modeling_depth_pro.py | 3 +- 2 files changed, 88 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 4f97f37230cbbb..6f20838375cf84 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -42,6 +42,25 @@ _CONFIG_FOR_DOC = "DepthProConfig" +def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor: + """ + converts tensor from shape: + (num_patches, seq_len, hidden_size) -> (batch_size, num_patches_per_batch, seq_len, hidden_size) + """ + data = data.reshape(-1, batch_size, *data.shape[1:]) + data = data.transpose(0, 1) + return data + +def batch_to_patch(data: torch.Tensor) -> torch.Tensor: + """ + converts tensor from shape: + (batch_size, num_patches_per_batch, seq_len, hidden_size) -> (num_patches, seq_len, hidden_size) + """ + data = data.transpose(0, 1) + data = data.reshape(-1, *data.shape[2:]) + return data + + class DepthProViTPatchEmbeddings(nn.Module): """ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings @@ -135,13 +154,17 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: return torch.cat((class_pos_embed, patch_pos_embed), dim=1) - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - batch_size, _, height, width = pixel_values.shape + def forward( + self, + pixel_values: torch.Tensor, + batch_size: Optional[int] = None, + ) -> torch.Tensor: + n, _, height, width = pixel_values.shape target_dtype = self.patch_embeddings.projection.weight.dtype embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) # add the [CLS] token to the embedded patch tokens - cls_tokens = self.cls_token.expand(batch_size, -1, -1) + cls_tokens = self.cls_token.expand(n, -1, -1) embeddings = torch.cat((cls_tokens, embeddings), dim=1) # add positional encoding to each token @@ -149,11 +172,14 @@ def forward(self, pixel_values: torch.Tensor) -> 
torch.Tensor: embeddings = self.dropout(embeddings) + if batch_size is not None: + embeddings = patch_to_batch(embeddings, batch_size) + return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthPro class DepthProViTSelfAttention(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention.__init__ with ViT->DepthPro def __init__(self, config: DepthProConfig) -> None: super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -172,13 +198,20 @@ def __init__(self, config: DepthProConfig) -> None: self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention.transpose_for_scores with ViT->DepthPro def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(new_x_shape) return x.permute(0, 2, 1, 3) + # Taken from transformers.models.vit.modeling_vit.ViTSelfAttention.forward with ViT->DepthPro + # with the addition of `batch_size` def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: mixed_query_layer = self.query(hidden_states) @@ -202,25 +235,37 @@ def forward( if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = torch.matmul(attention_probs, value_layer) + if batch_size is not None: + attention_probs_batched = patch_to_batch(attention_probs, batch_size) + attention_probs_patched = batch_to_patch(attention_probs_batched) + else: + attention_probs_patched = attention_probs_batched = attention_probs + + context_layer = torch.matmul(attention_probs_patched, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(new_context_layer_shape) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + outputs = (context_layer, attention_probs_batched) if output_attentions else (context_layer,) return outputs -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention.__init__ with Dinov2Config->DepthProConfig, Dinov2->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__(config) self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + # Taken from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention.forward with Dinov2Config->DepthProConfig, Dinov2->DepthProViT + # with the addition of `batch_size` def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
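A quick illustration of the patch-to-batch bookkeeping this commit threads through the ViT embeddings and attention classes above (and the layer and encoder classes later in this diff): the new `batch_size` argument lets intermediate hidden states be regrouped from the per-patch layout produced by the patch encoder into a per-batch layout, and back. The sketch below restates the two helpers added at the top of `modeling_depth_pro.py` and checks their round trip on dummy tensors; the tensor sizes are arbitrary example values, not taken from the model config, and the snippet is not part of the patch itself.

```python
# Minimal sketch of the patch <-> batch reshaping helpers introduced in this commit.
# Sizes below are arbitrary example values chosen only for illustration.
import torch


def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor:
    # (num_patches, seq_len, hidden_size) -> (batch_size, num_patches_per_batch, seq_len, hidden_size)
    data = data.reshape(-1, batch_size, *data.shape[1:])
    return data.transpose(0, 1)


def batch_to_patch(data: torch.Tensor) -> torch.Tensor:
    # (batch_size, num_patches_per_batch, seq_len, hidden_size) -> (num_patches, seq_len, hidden_size)
    data = data.transpose(0, 1)
    return data.reshape(-1, *data.shape[2:])


num_patches, seq_len, hidden_size, batch_size = 9, 577, 64, 3
patches = torch.randn(num_patches, seq_len, hidden_size)

batched = patch_to_batch(patches, batch_size)
print(batched.shape)  # torch.Size([3, 3, 577, 64])
# sample 0 collects patches [0, 3, 6], sample 1 collects [1, 4, 7], sample 2 collects [2, 5, 8]

restored = batch_to_patch(batched)
print(restored.shape)  # torch.Size([9, 577, 64])
assert torch.equal(patches, restored)  # the two helpers are exact inverses
```

This reproduces, as a pure reshape, the `[0, 3, 6], [1, 4, 7], [2, 5, 8]` grouping that the removed `torch.arange`-based indexing elsewhere in this same diff used to compute after the fact.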
@@ -229,7 +274,7 @@ def forward( 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' ) return super().forward( - hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions, batch_size=batch_size, ) mixed_query_layer = self.query(hidden_states) @@ -274,14 +319,15 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTAttention(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTAttention.__init__ with ViTConfig->DepthProConfig, ViT->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__() self.attention = DepthProViTSelfAttention(config) self.output = DepthProViTSelfOutput(config) self.pruned_heads = set() + # Copied from transformers.models.vit.modeling_vit.ViTAttention.prune_heads def prune_heads(self, heads: Set[int]) -> None: if len(heads) == 0: return @@ -300,13 +346,16 @@ def prune_heads(self, heads: Set[int]) -> None: self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) + # Taken from transformers.models.vit.modeling_vit.ViTAttention.prune_heads + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - self_outputs = self.attention(hidden_states, head_mask, output_attentions) + self_outputs = self.attention(hidden_states, head_mask, output_attentions, batch_size) attention_output = self.output(self_outputs[0], hidden_states) @@ -411,10 +460,10 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: } -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing class DepthProViTLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer.__init__ with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -431,16 +480,23 @@ def __init__(self, config: DepthProConfig) -> None: self.mlp = DepthProViTMLP(config) self.layer_scale2 = DepthProViTLayerScale(config) + # Taken from transformers.models.dinov2.modeling_dinov2.Dinov2Layer.forward + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if batch_size is not None: + hidden_states = batch_to_patch(hidden_states) + self_attention_outputs = self.attention( self.norm1(hidden_states), # in DepthProViT, layernorm is applied before self-attention head_mask, output_attentions=output_attentions, + batch_size=batch_size, ) attention_output = self_attention_outputs[0] @@ -458,19 +514,24 @@ def forward( # second residual connection layer_output = self.drop_path(layer_output) + hidden_states + 
if batch_size is not None: + layer_output = patch_to_batch(layer_output, batch_size) + outputs = (layer_output,) + outputs return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTEncoder(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTEncoder.__init__ with ViTConfig->DepthProConfig, ViT->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.layer = nn.ModuleList([DepthProViTLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + # Taken from transformers.models.vit.modeling_vit.ViTEncoder.__init__ + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, @@ -478,6 +539,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, + batch_size: Optional[int] = None, ) -> Union[tuple, BaseModelOutput]: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -494,9 +556,10 @@ def forward( hidden_states, layer_head_mask, output_attentions, + batch_size, ) else: - layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, batch_size) hidden_states = layer_outputs[0] @@ -532,6 +595,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + batch_size: Optional[int] = None, ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -542,7 +606,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - embedding_output = self.embeddings(pixel_values) + embedding_output = self.embeddings(pixel_values, batch_size=batch_size) encoder_outputs = self.encoder( embedding_output, @@ -550,6 +614,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + batch_size=batch_size, ) sequence_output = encoder_outputs[0] sequence_output = self.layernorm(sequence_output) @@ -871,9 +936,12 @@ def forward( output_attentions=output_attentions, output_hidden_states=True, # required for intermediate features return_dict=True, + batch_size=B, ) + last_hidden_state = patch_encodings.last_hidden_state + last_hidden_state = batch_to_patch(last_hidden_state) scaled_images_last_hidden_state = torch.split_with_sizes( - patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] + last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( @@ -917,6 +985,7 @@ def forward( self.intermediate_hook_ids[i] + 1 ) # +1 to correct index position as hidden_states contain embedding output as well hidden_state = patch_encodings.hidden_states[layer_id] + hidden_state = batch_to_patch(hidden_state) hidden_state = hidden_state[ : scaled_images_num_patches[-1] ] # num_patches to be of same length as highest resolution @@ -985,17 +1054,6 @@ def forward( hidden_states = patch_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions if output_attentions else None - num_patches = sum(scaled_images_num_patches) - # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 - 
indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T - indexes = indexes.to(last_hidden_state.device) - - last_hidden_state = last_hidden_state[indexes].mean(1) - if hidden_states is not None: - hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) - if attentions is not None: - attentions = tuple([state[indexes].mean(1) for state in attentions]) - if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 3d37965dcd1bd0..9e881cf273b7b9 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -131,7 +131,8 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size)) def create_and_check_for_depth_estimation(self, config, pixel_values, labels): config.num_labels = self.num_labels From 7cf2485adef235b906b469a38002a8dacc3d0537 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 11:14:21 +0500 Subject: [PATCH 48/72] fix ruff formating --- .../depth_pro/image_processing_depth_pro.py | 18 ++++++---- .../models/depth_pro/modeling_depth_pro.py | 36 +++++++++++-------- .../depth_pro/test_modeling_depth_pro.py | 8 +++-- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 65a29900c63744..164c7e28c6e237 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -157,13 +157,17 @@ def resize( raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - return torch.nn.functional.interpolate( - # input should be (B, C, H, W) - input=torch.from_numpy(image).unsqueeze(0), - size=output_size, - mode=pil_torch_interpolation_mapping[resample].value, - antialias=antialias, - ).squeeze(0).numpy() + return ( + torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=torch.from_numpy(image).unsqueeze(0), + size=output_size, + mode=pil_torch_interpolation_mapping[resample].value, + antialias=antialias, + ) + .squeeze(0) + .numpy() + ) def _validate_input_arguments( self, diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 6f20838375cf84..8fa286c70919f3 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -51,6 +51,7 @@ def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor: data = data.transpose(0, 1) return data + def batch_to_patch(data: torch.Tensor) -> torch.Tensor: """ converts tensor from shape: @@ -155,10 +156,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward( - self, - pixel_values: torch.Tensor, - batch_size: Optional[int] = None, - ) -> torch.Tensor: + self, + pixel_values: torch.Tensor, + batch_size: Optional[int] = None, + ) -> torch.Tensor: n, _, height, width = pixel_values.shape target_dtype = self.patch_embeddings.projection.weight.dtype embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) @@ -274,7 +275,10 @@ def forward( 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( - hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions, batch_size=batch_size, + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + batch_size=batch_size, ) mixed_query_layer = self.query(hidden_states) @@ -940,9 +944,9 @@ def forward( ) last_hidden_state = patch_encodings.last_hidden_state last_hidden_state = batch_to_patch(last_hidden_state) - scaled_images_last_hidden_state = torch.split_with_sizes( - last_hidden_state, scaled_images_num_patches[::-1] - )[::-1] # -1 as patch encoder expects high res patches first + scaled_images_last_hidden_state = torch.split_with_sizes(last_hidden_state, scaled_images_num_patches[::-1]) + scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] + # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( pixel_values=scaled_images[0], # provide least resolution image @@ -1610,12 +1614,16 @@ def forward( fused_features = self.fusion_stage(features) predicted_depth = self.head(fused_features) - fov = self.fov_model( - pixel_values=pixel_values, - # use lowest scaled image features for fov model - global_features=features[0].detach(), - head_mask=head_mask, - ) if self.use_fov_model else None + fov = ( + self.fov_model( + pixel_values=pixel_values, + # use lowest scaled image features for fov model + global_features=features[0].detach(), + head_mask=head_mask, + ) + if self.use_fov_model + else None + ) if not return_dict: outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 9e881cf273b7b9..e350b067a118c8 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -91,7 +91,7 @@ def __init__( self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 - self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -131,8 +131,10 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size)) + num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size) + ) def create_and_check_for_depth_estimation(self, config, pixel_values, labels): config.num_labels = self.num_labels From 0aa451df3e6862291d2097d5a1e6aa5e9aa91f23 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 22:41:11 +0500 Subject: [PATCH 49/72] fix docstring failure --- .../models/depth_pro/modeling_depth_pro.py | 16 +++++++++++++++- utils/check_docstrings.py | 1 - utils/check_repo.py | 1 - 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 
8fa286c70919f3..1498ce4003d39b 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1496,11 +1496,25 @@ class DepthProDepthEstimatorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None +DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + use_fov_model (`bool`, *optional*, defaults to `True`): + Whether to use `DepthProFOVModel` to generate the field of view. +""" + + @add_start_docstrings( """ DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). """, - DEPTH_PRO_START_DOCSTRING, + DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING, ) class DepthProForDepthEstimation(DepthProPreTrainedModel): def __init__(self, config, use_fov_model=None): diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 34deed0df47e01..0be960f4a33e6d 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -140,7 +140,6 @@ "DPRReaderTokenizer", "DPRReaderTokenizerFast", "DPTModel", - "DepthProModel", "Data2VecAudioConfig", "Data2VecTextConfig", "Data2VecTextModel", diff --git a/utils/check_repo.py b/utils/check_repo.py index 2e131e8791530e..10be5cdcd26230 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -213,7 +213,6 @@ "JukeboxPrior", "SamModel", "DPTForDepthEstimation", - "DepthProForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", "ViltForImagesAndTextClassification", From 160afbf57789906a134000a5b6ee99982cf4ae6f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 23:32:39 +0500 Subject: [PATCH 50/72] use num_fov_head_layers in tests --- tests/models/depth_pro/test_modeling_depth_pro.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index e350b067a118c8..03f69e8ad1fee5 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -64,6 +64,7 @@ def __init__( attention_probs_dropout_prob=0.1, initializer_range=0.02, use_fov_model=True, + num_fov_head_layers=0, num_labels=3, ): self.parent = parent @@ -88,6 +89,7 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 @@ -124,6 +126,7 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, initializer_range=self.initializer_range, use_fov_model=self.use_fov_model, + num_fov_head_layers=self.num_fov_head_layers, ) def create_and_check_model(self, config, pixel_values, labels): From 9d2be2603d9a75346526b2a37711c6edc40125c8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:30:08 +0500 Subject: [PATCH 51/72] update doc --- 
docs/source/en/model_doc/depth_pro.md | 37 +++++++++++++++++----------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md
index 7e4ac13f1d648f..041c4d49dffc93 100644
--- a/docs/source/en/model_doc/depth_pro.md
+++ b/docs/source/en/model_doc/depth_pro.md
@@ -26,7 +26,7 @@ The abstract from the paper is the following:
 *We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.*
-drawing DepthPro architecture. Taken from the original paper.
@@ -38,16 +38,26 @@ This model was contributed by [geetu040](https://github.com/geetu040). The origi
 ## Usage tips

 ```python
-from transformers import Dinov2Config, DepthProConfig, DepthProForDepthEstimation
+from transformers import DepthProConfig, DepthProForDepthEstimation

-# initialize with a Transformer-based backbone such as DINOv2
-# in that case, we also specify `reshape_hidden_states=False` to get feature maps of shape (batch_size, num_channels, height, width)
-backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False)
+config = DepthProConfig()
+model = DepthProForDepthEstimation(config=config)
+```
+
+- By default, the model takes an input image of size `1536`. This can be changed via the config, and the model is also compatible with images of other widths and heights.
+- The input image is scaled with the ratios specified in `scaled_images_ratios`, then each scaled image is split into patches of size `patch_size` with the overlap ratios given in `scaled_images_overlap_ratios`.
+- These patches go through `DINOv2 (ViT)`-based encoders and are reassembled via a `DPT`-based decoder.
+- `DepthProForDepthEstimation` can also predict the `FOV (Field of View)` if `use_fov_model` is set to `True` in the config.
+- `DepthProImageProcessor` can be used for preprocessing the inputs and postprocessing the outputs. `DepthProImageProcessor.post_process_depth_estimation` interpolates the `predicted_depth` back to match the input image size.
+- To generate a `predicted_depth` of the same size as the input image, make sure the config is created such that
+```
+image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size

-config = DepthProConfig(backbone_config=backbone_config)
-model = DepthProForDepthEstimation(config=config)
+
+where
+n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios)
 ```
+
 ### Using Scaled Dot Product Attention (SDPA)

PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`.
This function @@ -59,9 +69,9 @@ page for more information. SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` -from transformers import ViTForImageClassification -model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +```py +from transformers import DepthProForDepthEstimation +model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", attn_implementation="sdpa", torch_dtype=torch.float16) ... ``` @@ -78,12 +88,11 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` ## Resources -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro. +- Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) -- Demo notebooks for [`DepthProForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DepthPro). +- Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) -- [Semantic segmentation task guide](../tasks/semantic_segmentation) -- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) + If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. From e208459cebe6b8f821aa14e0d9e7735466751daf Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:38:32 +0500 Subject: [PATCH 52/72] check consistency with config --- .../models/depth_pro/modeling_depth_pro.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 1498ce4003d39b..605ea38ea736e7 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -838,6 +838,23 @@ def __init__(self, config: DepthProConfig) -> None: f"by patch_embeddings_size={config.patch_embeddings_size}." ) + # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims are consistent + if not (len(config.scaled_images_ratios) == len(config.scaled_images_overlap_ratios) == len(config.scaled_images_feature_dims)): + raise ValueError( + f"len(scaled_images_ratios)={len(config.scaled_images_ratios)} and " + f"len(scaled_images_overlap_ratios)={len(config.scaled_images_overlap_ratios)} and " + f"len(scaled_images_feature_dims)={len(config.scaled_images_feature_dims)}, " + f"should match in config." + ) + + # intermediate_hook_ids, intermediate_feature_dims are consistent + if not (len(config.intermediate_hook_ids) == len(config.intermediate_feature_dims)): + raise ValueError( + f"len(intermediate_hook_ids)={len(config.intermediate_hook_ids)} and " + f"len(intermediate_feature_dims)={len(config.intermediate_feature_dims)}, " + f"should match in config." 
+ ) + # patch encoder self.patch_encoder = DepthProViT(config) From 0415722bd6dd44f4b7d56d0cacf8cdd3f958cb41 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:42:31 +0500 Subject: [PATCH 53/72] ruff formatting --- src/transformers/models/depth_pro/modeling_depth_pro.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 605ea38ea736e7..040b9eb07962e6 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -839,7 +839,11 @@ def __init__(self, config: DepthProConfig) -> None: ) # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims are consistent - if not (len(config.scaled_images_ratios) == len(config.scaled_images_overlap_ratios) == len(config.scaled_images_feature_dims)): + if not ( + len(config.scaled_images_ratios) + == len(config.scaled_images_overlap_ratios) + == len(config.scaled_images_feature_dims) + ): raise ValueError( f"len(scaled_images_ratios)={len(config.scaled_images_ratios)} and " f"len(scaled_images_overlap_ratios)={len(config.scaled_images_overlap_ratios)} and " From f4e7404191244a86a91d5e93c3be82ffa7d6b970 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 10:57:52 +0500 Subject: [PATCH 54/72] update test case --- tests/models/depth_pro/test_modeling_depth_pro.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 03f69e8ad1fee5..54c5e870a258f3 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -52,12 +52,12 @@ def __init__( use_labels=True, hidden_size=32, fusion_hidden_size=16, - intermediate_hook_ids=[1, 0], - intermediate_feature_dims=[8, 8], + intermediate_hook_ids=[0], + intermediate_feature_dims=[8], scaled_images_ratios=[0.5, 1.0], scaled_images_overlap_ratios=[0.0, 0.2], scaled_images_feature_dims=[12, 12], - num_hidden_layers=2, + num_hidden_layers=1, num_attention_heads=4, hidden_act="gelu", hidden_dropout_prob=0.1, @@ -95,6 +95,9 @@ def __init__( self.num_patches = (patch_size // patch_embeddings_size) ** 2 self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) + self.expected_depth_size = 2**(n_fusion_blocks+1) * patch_size / patch_embeddings_size + def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -145,7 +148,7 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 2c1cc10ee8ddefce3649dac81144e5095ee00ba8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 14:55:06 +0500 Subject: [PATCH 55/72] fix ruff formatting --- tests/models/depth_pro/test_modeling_depth_pro.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 54c5e870a258f3..215756d45e99b9 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -96,7 +96,7 @@ def __init__( self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - self.expected_depth_size = 2**(n_fusion_blocks+1) * patch_size / patch_embeddings_size + self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size / patch_embeddings_size def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -148,7 +148,9 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size)) + self.parent.assertEqual( + result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size) + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 871b80db318a8e8b2b70533acd62cbcec678cc74 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 10:42:02 +0500 Subject: [PATCH 56/72] add tests for fov --- .../depth_pro/test_modeling_depth_pro.py | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 215756d45e99b9..48983c9aca3a36 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -63,8 +63,7 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, initializer_range=0.02, - use_fov_model=True, - num_fov_head_layers=0, + use_fov_model=False, num_labels=3, ): self.parent = parent @@ -89,7 +88,6 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.use_fov_model = use_fov_model - self.num_fov_head_layers = num_fov_head_layers self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 @@ -129,7 +127,6 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, initializer_range=self.initializer_range, use_fov_model=self.use_fov_model, - num_fov_head_layers=self.num_fov_head_layers, ) def create_and_check_model(self, config, pixel_values, labels): @@ -152,6 +149,36 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size) ) + def create_and_check_for_fov(self, config, pixel_values, labels): + model = DepthProForDepthEstimation(config, use_fov_model=True) + model.to(torch_device) + model.eval() + + # check if the fov_model (DinoV2-based encoder) is created + self.parent.assertIsNotNone(model.fov_model) + + batched_pixel_values = pixel_values + row_pixel_values = pixel_values[:1] + + with torch.no_grad(): + model_batched_output_fov = model(batched_pixel_values).fov + model_row_output_fov = model(row_pixel_values).fov + + # check if fov is returned + self.parent.assertIsNotNone(model_batched_output_fov) + self.parent.assertIsNotNone(model_row_output_fov) + + # check output 
shape consistency for fov + self.parent.assertEqual(model_batched_output_fov.shape, (self.batch_size,)) + + # check equivalence between batched and single row outputs for fov + diff = torch.max(torch.abs(model_row_output_fov - model_batched_output_fov[:1])) + model_name = model.__class__.__name__ + self.parent.assertTrue( + diff <= 1e-03, + msg=(f"Batched and Single row outputs are not equal in {model_name} for fov. " f"Difference={diff}."), + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values, labels = config_and_inputs @@ -208,6 +235,10 @@ def test_for_depth_estimation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + def test_for_fov(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_fov(*config_and_inputs) + def test_training(self): for model_class in self.all_model_classes: if model_class.__name__ == "DepthProForDepthEstimation": From 0ff06556163a39f90eede4d5e889554e46b9de46 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 15:11:06 +0500 Subject: [PATCH 57/72] use interpolation in postprocess --- .../models/depth_pro/image_processing_depth_pro.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 164c7e28c6e237..228c3d992457e4 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -393,10 +393,11 @@ def post_process_depth_estimation( outputs["fov"].append(fov) # interpolate - predicted_depth = self.resize( - predicted_depth.unsqueeze(0).unsqueeze(1), + predicted_depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=predicted_depth.unsqueeze(0).unsqueeze(1), size=target_size, - resample=self.resample, + mode=pil_torch_interpolation_mapping[self.resample].value, antialias=self.antialias, ).squeeze() From befa6cdbca6194a4fab82c9865bfb9deeebe54c7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 15:26:50 +0500 Subject: [PATCH 58/72] run and fix slow tests locally --- .../depth_pro/test_modeling_depth_pro.py | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 48983c9aca3a36..a3026801d59379 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -94,7 +94,7 @@ def __init__( self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size / patch_embeddings_size + self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size // patch_embeddings_size def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -313,8 +313,8 @@ def test_initialization(self): @slow def test_model_from_pretrained(self): - model_name = "Intel/depth_pro-large" - model = DepthProModel.from_pretrained(model_name) + model_path = "geetu040/DepthPro" + model = DepthProModel.from_pretrained(model_path) 
self.assertIsNotNone(model) @@ -329,8 +329,10 @@ def prepare_img(): @slow class DepthProModelIntegrationTest(unittest.TestCase): def test_inference_depth_estimation(self): - image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") - model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large").to(torch_device) + model_path = "geetu040/DepthPro" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device) + config = model.config image = prepare_img() inputs = image_processor(images=image, return_tensors="pt").to(torch_device) @@ -341,18 +343,21 @@ def test_inference_depth_estimation(self): predicted_depth = outputs.predicted_depth # verify the predicted depth - expected_shape = torch.Size((1, 384, 384)) + n_fusion_blocks = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + expected_depth_size = 2 ** (n_fusion_blocks + 1) * config.patch_size // config.patch_embeddings_size + expected_shape = torch.Size((1, expected_depth_size, expected_depth_size)) self.assertEqual(predicted_depth.shape, expected_shape) expected_slice = torch.tensor( - [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + [[1.0582, 1.1225, 1.1335], [1.1154, 1.1398, 1.1486], [1.1434, 1.1500, 1.1643]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) def test_post_processing_depth_estimation(self): - image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") - model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large") + model_path = "geetu040/DepthPro" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path) image = prepare_img() inputs = image_processor(images=image, return_tensors="pt") @@ -361,17 +366,15 @@ def test_post_processing_depth_estimation(self): with torch.no_grad(): outputs = model(**inputs) - predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] - expected_shape = torch.Size((384, 384)) - self.assertTrue(predicted_depth.shape == expected_shape) - - predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) - predicted_depth_l = predicted_depth_l[0]["predicted_depth"] - expected_shape = torch.Size((500, 500)) - self.assertTrue(predicted_depth_l.shape == expected_shape) + predicted_depth = outputs.predicted_depth + fov = outputs.fov + target_size = [[image.height, image.width]] * len(predicted_depth) - output_enlarged = torch.nn.functional.interpolate( - predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False - ).squeeze() - self.assertTrue(output_enlarged.shape == expected_shape) - self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3)) + outputs = image_processor.post_process_depth_estimation( + predicted_depths=predicted_depth, + fovs=fov, + target_sizes=target_size, + ) + predicted_depth = outputs["predicted_depth"][0] + expected_shape = torch.Size((image.height, image.width)) + self.assertTrue(predicted_depth.shape == expected_shape) From 99ac5e81cc98b9297a81af784bf227179f1609e3 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 19:53:22 +0500 Subject: [PATCH 59/72] use scaled_images_features for image and fov encoder --- .../models/depth_pro/modeling_depth_pro.py | 80 
++++++++++--------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 040b9eb07962e6..f77e24925c88b1 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -959,7 +959,8 @@ def forward( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=True, # required for intermediate features + # required for intermediate features + output_hidden_states=self.n_intermediate_hooks or output_hidden_states, return_dict=True, batch_size=B, ) @@ -969,12 +970,16 @@ def forward( scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] # -1 as patch encoder expects high res patches first + # scale the image to patch size for image_encoder + image_scaled_to_patch_size = nn.functional.interpolate( + pixel_values, + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, + ) image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=image_scaled_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, - return_dict=True, ) # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram @@ -1041,19 +1046,15 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + ) # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (B, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = merge( - image_features, - batch_size=B, - merge_out_size=self.out_size * 2 ** (0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + # no merge required for image_features as they are already in batches instead of patches # d. 
upsample image_features = self.upsample_image( @@ -1073,8 +1074,6 @@ def forward( *intermediate_features, ] - # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = patch_encodings.last_hidden_state hidden_states = patch_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions if output_attentions else None @@ -1420,35 +1419,42 @@ def forward( B, C, W, H = pixel_values.shape # follow the steps same as with image features in DepthProEncoder - pixel_values = interpolate( - pixel_values, - scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image - ) - patches = patch( + # except for the extra encoder_neck layer applied + + image_scaled_to_patch_size = nn.functional.interpolate( pixel_values, - patch_size=self.config.patch_size, - overlap_ratio=self.config.scaled_images_overlap_ratios[0], + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, ) - encoder_outputs = self.encoder( - patches, + encodings = self.encoder( + image_scaled_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ) - last_hidden_state = encoder_outputs.last_hidden_state - last_hidden_state = self.encoder_neck(last_hidden_state) - last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) - last_hidden_state = merge( - last_hidden_state, - batch_size=B, - merge_out_size=self.out_size, ) + # a. extract hidden_state + hidden_state = ( + encodings.last_hidden_state + ) # (B, self.seq_len+1, config.hidden_size) + # extra step + hidden_state = self.encoder_neck(hidden_state) + # (B, self.fusion_hidden_size//2, self.out_size, self.out_size) + + # b. reshape back to image like + fov_features = reshape_feature( + hidden_state, self.out_size, self.out_size + ) # (B, config.hidden_size, self.out_size, self.out_size) + + # c. merge patches back together + # no merge required for fov_features as they are already in batches instead of patches + + # d. 
upsample + # no upsampling required for fov_features, the head later downsamples to create scalars + global_features = self.global_neck(global_features) - last_hidden_state = last_hidden_state + global_features - fov_output = self.head(last_hidden_state) + fov_features = fov_features + global_features + fov_output = self.head(fov_features) fov_output = fov_output.reshape(B) return fov_output @@ -1652,7 +1658,7 @@ def forward( fov = ( self.fov_model( pixel_values=pixel_values, - # use lowest scaled image features for fov model + # frozon features from encoder are used global_features=features[0].detach(), head_mask=head_mask, ) From ebb62dd2190a164d8f4cfbb218cd7c2099515ae1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:28:32 +0500 Subject: [PATCH 60/72] return fused_hidden_states in fusion stage --- .../models/depth_pro/modeling_depth_pro.py | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f77e24925c88b1..91758a3db485fb 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -765,7 +765,6 @@ def merge(patches, batch_size, merge_out_size): boxes.append(boxes_in_row) boxes = torch.cat(boxes, dim=-2) - boxes = boxes[..., :merge_out_size, :merge_out_size] return boxes @@ -1303,7 +1302,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: # Taken from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer -# except it uses deconv, skip_add and avoids interpolation (it always receives consitent inputs) +# except it uses deconv annd skip_add class DepthProFeatureFusionLayer(nn.Module): def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: super().__init__() @@ -1328,6 +1327,10 @@ def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: def forward(self, hidden_state, residual=None): if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) hidden_state = self.residual_layer2(hidden_state) @@ -1357,13 +1360,17 @@ def forward(self, hidden_states): f"doesnot match len(hidden_states)={len(hidden_states)}" ) - # first layer only uses the last hidden_state - fused_hidden_state = self.layers[0](hidden_states[0]) - # looping from the second layer to last layer - for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): - fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states = [] + fused_hidden_state = None + for hidden_state, layer in zip(hidden_states, self.layers): + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) - return fused_hidden_state + return fused_hidden_states class DepthProFOVModel(nn.Module): @@ -1652,8 +1659,8 @@ def forward( ) features = depth_pro_outputs.features features = [proj(feature) for proj, feature in zip(self.projections, features)] - fused_features = self.fusion_stage(features) - predicted_depth = self.head(fused_features) + fused_hidden_states = self.fusion_stage(features) + predicted_depth = 
self.head(fused_hidden_states[-1]) fov = ( self.fov_model( From 46c88e8bd3ba4dc2331b81fad1a54a4b902445e7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:44:44 +0500 Subject: [PATCH 61/72] fix example --- .../models/depth_pro/modeling_depth_pro.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 91758a3db485fb..8f1609b6fb1514 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1206,14 +1206,8 @@ def forward( >>> with torch.no_grad(): ... output = model(**inputs) ... - >>> for state in output.last_hidden_state: - ... print(state.shape) - ... - torch.Size([1, 1024, 48, 48]) - torch.Size([1, 1024, 96, 96]) - torch.Size([1, 512, 192, 192]) - torch.Size([1, 256, 384, 384]) - torch.Size([1, 256, 768, 768]) + >>> output.last_hidden_state.shape + torch.Size([1, 35, 577, 1024]) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From 243135880028d09441fb41440f760a9a2c329a33 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:48:36 +0500 Subject: [PATCH 62/72] fix ruff --- src/transformers/models/depth_pro/modeling_depth_pro.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 8f1609b6fb1514..bd6c811a1163b0 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1043,9 +1043,7 @@ def forward( # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = ( - image_encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + hidden_state = image_encodings.last_hidden_state # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( @@ -1434,9 +1432,7 @@ def forward( ) # a. extract hidden_state - hidden_state = ( - encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + hidden_state = encodings.last_hidden_state # (B, self.seq_len+1, config.hidden_size) # extra step hidden_state = self.encoder_neck(hidden_state) # (B, self.fusion_hidden_size//2, self.out_size, self.out_size) From d9d3a49906bab33156ab97f8ebb7b2bd87d45a49 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 21 Dec 2024 10:23:09 +0500 Subject: [PATCH 63/72] fix copyright license for all files --- docs/source/en/model_doc/depth_pro.md | 2 +- src/transformers/models/depth_pro/__init__.py | 2 +- src/transformers/models/depth_pro/configuration_depth_pro.py | 2 +- .../models/depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- src/transformers/models/depth_pro/image_processing_depth_pro.py | 2 +- .../models/depth_pro/image_processing_depth_pro_fast.py | 2 +- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- tests/models/depth_pro/test_image_processing_depth_pro.py | 2 +- tests/models/depth_pro/test_modeling_depth_pro.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 041c4d49dffc93..9019547434af84 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -1,4 +1,4 @@ -