From 2986dc21201fe1a687badd62d2be667d6b335ffe Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 3 Nov 2024 10:48:55 +0500 Subject: [PATCH 01/72] implement config and model building blocks --- .../depth_pro/configuration_depth_pro.py | 167 ++ .../models/depth_pro/modeling_depth_pro.py | 1404 +++++++++++++++++ 2 files changed, 1571 insertions(+) create mode 100644 src/transformers/models/depth_pro/configuration_depth_pro.py create mode 100644 src/transformers/models/depth_pro/modeling_depth_pro.py diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py new file mode 100644 index 00000000000000..ad0f1016f7a147 --- /dev/null +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DepthPro model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig +from transformers.utils import logging +from transformers.utils.backbone_utils import get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class DepthProConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DepthProModel`]. It is used to instantiate a + DepthPro model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DepthPro + [apple/DepthPro](https://huggingface.co/apple/DepthPro) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. 
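+        decoder_hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the feature maps used by the decoder (feature fusion) blocks.
+        patch_encoder_hook_ids (`List[int]`, *optional*, defaults to `[5, 11]`):
+            Indices of the patch encoder layers whose hidden states are tapped as intermediate features.
+        patch_encoder_feature_dims (`List[int]`, *optional*, defaults to `[256, 512, 1024, 1024]`):
+            Number of channels of the upsampled encoder features that are passed to the decoder.
+        use_batch_norm_in_decoder (`bool`, *optional*, defaults to `False`):
+            Whether to use batch normalization in the residual layers of the decoder.
+        use_fov (`bool`, *optional*, defaults to `False`):
+            Whether to add the field of view (FOV) estimation model on top of the encoder.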
+ + Example: + + ```python + >>> from transformers import DepthProConfig, DepthProModel + + >>> # Initializing a DepthPro apple/DepthPro style configuration + >>> configuration = DepthProConfig() + + >>> # Initializing a model (with random weights) from the apple/DepthPro style configuration + >>> model = DepthProModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "depth_pro" + + def __init__( + self, + hidden_size=1024, # changed + decoder_hidden_size=256, + num_hidden_layers=24, # changed + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=384, + patch_size=16, # changed + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + patch_encoder_hook_ids = [5, 11], + # patch_encoder_hook_ids = [5, 11, 17, 23], + patch_encoder_feature_dims = [256, 512, 1024, 1024], + use_batch_norm_in_decoder=False, + use_fov=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.decoder_hidden_size = decoder_hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.patch_encoder_hook_ids = patch_encoder_hook_ids + self.patch_encoder_feature_dims = patch_encoder_feature_dims + self.use_batch_norm_in_decoder = use_batch_norm_in_decoder + self.use_fov = use_fov diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py new file mode 100644 index 00000000000000..f73b74060f5778 --- /dev/null +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -0,0 +1,1404 @@ +# coding=utf-8 +# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch DepthPro model.""" + +from icecream import ic + +import collections.abc +import math +from typing import Dict, List, Optional, Set, Tuple, Union + +import torch +from torch import nn +from dataclasses import dataclass + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, +) +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from .configuration_depth_pro import DepthProConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT +class DepthProViTPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.dinov2.modeling_dinov2.DepthProViTEmbeddings +# with DepthProViT->DepthProViT and antialias=True in interpolation +class DepthProViTEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. + """ + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.patch_embeddings = DepthProViTPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + target_dtype = patch_pos_embed.dtype + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(torch.float32), + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProPatchEmbeddings + ).to(dtype=target_dtype) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthProViT +class DepthProViTSelfAttention(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SelfAttention with Dinov2->DepthProViT +class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): + def __init__(self, config: DepthProConfig) -> None: + super().__init__(config) + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + ) + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + head_mask, + self.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=None, + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, None + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DepthProViT +class DepthProViTSelfOutput(nn.Module): + """ + The residual connection is defined in DepthProViTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->DepthProViT +class DepthProViTAttention(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.attention = DepthProViTSelfAttention(config) + self.output = DepthProViTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->DepthProViT +class DepthProViTSdpaAttention(DepthProViTAttention): + def __init__(self, config: DepthProConfig) -> None: + super().__init__(config) + self.attention = DepthProViTSdpaSelfAttention(config) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaAttention with 
Dinov2->DepthProViT +class DepthProViTLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath +class DepthProViTDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthProViT +class DepthProViTMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthProViT +class DepthProViTSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +DEPTHPROVIT_ATTENTION_CLASSES = { + "eager": 
DepthProViTAttention, + "sdpa": DepthProViTSdpaAttention, +} + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2->DepthProViT +class DepthProViTLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = DEPTHPROVIT_ATTENTION_CLASSES[config._attn_implementation](config) + self.layer_scale1 = DepthProViTLayerScale(config) + self.drop_path = DepthProViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = DepthProViTSwiGLUFFN(config) + else: + self.mlp = DepthProViTMLP(config) + self.layer_scale2 = DepthProViTLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in DepthProViT, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in DepthProViT, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->DepthProViT +class DepthProViTEncoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([DepthProViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + 
last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class DepthProViT(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.embeddings = DepthProViTEmbeddings(config) + self.encoder = DepthProViTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class DepthProEncoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + + self.out_size = 24 # TODO: image_size // patch_size + + # patch encoder + self.patch_encoder = DepthProViT(config) + self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[0]].register_forward_hook( + self._intermediate0_hook + ) + self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[1]].register_forward_hook( + self._intermediate1_hook + ) + + # image encoder + self.image_encoder = DepthProViT(config) + + # upsampling features (1-2) + self.upsample_intermediate0 = self._create_project_upsample_block( + input_dims=config.hidden_size, + intermediate_dims=config.patch_encoder_feature_dims[0], + output_dims=config.decoder_hidden_size, + n_upsample_layers=3, + ) + self.upsample_intermediate1 = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[0], + n_upsample_layers=2, + ) + + # upsampling features (3-5) + self.upsample_high_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[1], + n_upsample_layers=1, + ) + self.upsample_med_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[2], + n_upsample_layers=1, + ) + self.upsample_low_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[3], + n_upsample_layers=1, + ) + + # upsampling features (6) + self.upsample_image = nn.ConvTranspose2d( + in_channels=config.hidden_size, + out_channels=config.patch_encoder_feature_dims[3], + kernel_size=2, + stride=2, + padding=0, + bias=True, + ) + self.fuse_image_with_low_res = nn.Conv2d( + 
in_channels=(config.patch_encoder_feature_dims[3] + config.patch_encoder_feature_dims[3]), + out_channels=config.patch_encoder_feature_dims[3], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + def _intermediate0_hook(self, model, input, output): + self.intermediate0_hidden_states = output[0] + + def _intermediate1_hook(self, model, input, output): + self.intermediate1_hidden_states = output[0] + + def _create_project_upsample_block( + self, + input_dims: int, + output_dims: int, + n_upsample_layers: int, + intermediate_dims: Optional[int] = None, + ) -> nn.Module: + + intermediate_dims = intermediate_dims or output_dims + + # Projection block followed by upsampling blocks. + blocks = [ + nn.Conv2d(input_dims, intermediate_dims, kernel_size=1, stride=1, padding=0, bias=False) + ] + [ + nn.ConvTranspose2d( + in_channels=(intermediate_dims if i == 0 else output_dims), + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=False + ) for i in range(n_upsample_layers) + ] + + return nn.Sequential(*blocks) + + def _interpolate(self, pixel_values, scale_factor): + return nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=scale_factor, + mode="bilinear", + align_corners=False, + ) + + def _patch(self, pixel_values, overlap_ratio): + patch_size = 384 # TODO: this should be infered + patch_stride = int(patch_size * (1 - overlap_ratio)) + + image_size = pixel_values.shape[-1] + steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1 + + x_patch_list = [] + for j in range(steps): + j0 = j * patch_stride + j1 = j0 + patch_size + + for i in range(steps): + i0 = i * patch_stride + i1 = i0 + patch_size + x_patch_list.append(pixel_values[..., j0:j1, i0:i1]) + + return torch.cat(x_patch_list, dim=0) + + def _reshape_feature( + self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 + ): + """Discard class token and reshape 1D feature map to a 2D grid.""" + b, hw, c = hidden_states.shape + + # Remove class token. 
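+        # The first `cls_token_offset` tokens (the [CLS] token by default) are dropped,
+        # so that only patch tokens are reshaped into the 2D grid below.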
+ if cls_token_offset > 0: + hidden_states = hidden_states[:, cls_token_offset:, :] + + # Shape: (batch, height, width, dim) -> (batch, dim, height, width) + hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) + return hidden_states + + def _merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor: + """Merge the patched input into a image with sliding window.""" + steps = int(math.sqrt(x.shape[0] // batch_size)) + + idx = 0 + + output_list = [] + for j in range(steps): + output_row_list = [] + for i in range(steps): + output = x[batch_size * idx : batch_size * (idx + 1)] + + if j != 0: + output = output[..., padding:, :] + if i != 0: + output = output[..., :, padding:] + if j != steps - 1: + output = output[..., :-padding, :] + if i != steps - 1: + output = output[..., :, :-padding] + + output_row_list.append(output) + idx += 1 + + output_row = torch.cat(output_row_list, dim=-1) + output_list.append(output_row) + output = torch.cat(output_list, dim=-2) + return output + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.shape[0] + + # STEP 1: create 3-level image + + high_res = pixel_values + med_res = self._interpolate(pixel_values, 0.5) + low_res = self._interpolate(pixel_values, 0.25) + + # STEP 2: create patches + + high_res_patches = self._patch(high_res, 0.25) + med_res_patches = self._patch(med_res, 0.5) + low_res_patches = low_res + + patches = torch.cat( + (high_res_patches, med_res_patches, low_res_patches), + dim=0, + ) + + # STEP 3: apply patch encoder + + patch_encodings = self.patch_encoder( + patches, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + patch_features = patch_encodings[0] + patch_features = self._reshape_feature( + patch_features, self.out_size, self.out_size + ) + + # STEP 4: Get Intermediate Features (features 1 and 2) + + intermediate0_features = self._reshape_feature( + self.intermediate0_hidden_states, + self.out_size, + self.out_size, + ) + intermediate1_features = self._reshape_feature( + self.intermediate1_hidden_states, + self.out_size, + self.out_size, + ) + intermediate0_features = self._merge( + intermediate0_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + intermediate1_features = self._merge( + intermediate1_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + # STEP 5: Get Patch Encoder Features (features 3-5) + + high_res_features, med_res_features, low_res_features = torch.split( + patch_features, + [len(high_res_patches), len(med_res_patches), len(low_res_patches)], + dim=0, + ) + + high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) + med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) + low_res_features = low_res_features + + # STEP 6: Get Image Encoder Features (features 6) + + image_encodings = self.image_encoder( + pixel_values=low_res_patches, + head_mask=head_mask, + 
output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + image_features = image_encodings[0] + image_features = self._reshape_feature( + image_features, self.out_size, self.out_size + ) + + # STEP 7: Upsample All Features (feature 1-6) + + # feature (1-2) + intermediate0_features = self.upsample_intermediate0( + intermediate0_features + ) + intermediate1_features = self.upsample_intermediate1( + intermediate1_features + ) + + # feature (3-5) + high_res_features = self.upsample_high_res(high_res_features) + med_res_features = self.upsample_med_res(med_res_features) + low_res_features = self.upsample_low_res(low_res_features) + + # feature (6) + image_features = self.upsample_image(image_features) + image_features = self.fuse_image_with_low_res( + torch.cat((low_res_features, image_features), dim=1) + ) + + last_hidden_state = [ + intermediate0_features, + intermediate1_features, + high_res_features, + med_res_features, + # low_res_features, + image_features, # fused with low_res_features + ] + + hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_attentions else None + attentions = patch_encodings.attentions + image_encodings.attentions if output_hidden_states else None + + if not return_dict: + return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=hidden_states, + attentions=attentions, + ) + + +class DepthProFOVModel(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size + + self.encoder = DepthProViT(config) + self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.low_res_neck = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.ReLU(True) + ) + self.head = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + ) + + def forward( + self, + pixel_values: torch.Tensor, + low_res_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=0.25, + mode="bilinear", + align_corners=False, + ) + encoder_outputs = self.encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_features = encoder_outputs[0] + + image_features = self.encoder_neck(image_features) + + # TODO: add some comments + image_features = image_features[:, 1:] + image_features = image_features.permute(0, 2, 1) + + 
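+        # project the decoder's low-resolution features to half the decoder width and
+        # halve their spatial resolution (stride-2 conv) so they can be added to the FOV encoder features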
low_res_features = self.low_res_neck(low_res_features) + + image_features = image_features.reshape_as(low_res_features) + image_features = image_features + low_res_features + fov_output = self.head(image_features) + fov_output = fov_output.reshape(1) + + if not return_dict: + head_outputs = (fov_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=fov_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro +class DepthProResidualLayer(nn.Module): + def __init__(self, config): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_decoder + self.hidden_size = config.decoder_hidden_size + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + self.hidden_size, + self.hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=(not self.use_batch_norm), + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + self.hidden_size, + self.hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=(not self.use_batch_norm), + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(self.hidden_size) + self.batch_norm2 = nn.BatchNorm2d(self.hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +class DepthProFeatureFusionLayer(nn.Module): + def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: + super().__init__() + self.config = config + self.use_deconv = use_deconv + + self.residual_layer1 = DepthProResidualLayer(config) + self.residual_layer2 = DepthProResidualLayer(config) + + if self.use_deconv: + self.deconv = nn.ConvTranspose2d( + in_channels=config.decoder_hidden_size, + out_channels=config.decoder_hidden_size, + kernel_size=2, + stride=2, + padding=0, + bias=False, + ) + + self.projection = nn.Conv2d(config.decoder_hidden_size, config.decoder_hidden_size, kernel_size=1, bias=True) + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, hidden_state, residual=None): + if residual is not None: + hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) + + hidden_state = self.residual_layer2(hidden_state) + if self.use_deconv: + hidden_state = self.deconv(hidden_state) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage +class DepthProDecoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + + self.hidden_size = config.decoder_hidden_size + self.decoder_feature_dims = [config.decoder_hidden_size] + config.patch_encoder_feature_dims + + self.projections = nn.ModuleList() + self.fusions = nn.ModuleList() + for i, dim in enumerate(self.decoder_feature_dims): + + # Projection + if i != 0: + # conv for hidden_states[1:] + projection = nn.Conv2d( + in_channels=dim, + out_channels=self.hidden_size, + kernel_size=3, 
+ stride=1, + padding=1, + bias=False, + ) + elif self.hidden_size != dim: + # first hidden_state with dim differnet from hidden_size + projection = nn.Conv2d( + in_channels=dim, + out_channels=self.hidden_size, + kernel_size=1, + bias=False, + ) + else: + # first hidden_state with dim same as hidden_size + projection = nn.Identity() + self.projections.append(projection) + + # Fusion + fusion = DepthProFeatureFusionLayer(config, use_deconv=(i!=0)) + self.fusions.append(fusion) + + def forward(self, hidden_states): + + if len(hidden_states) != len(self.decoder_feature_dims): + raise ValueError( + f"Got number of hidden_states = {len(hidden_states)}," + f"expected number of hidden_states = {len(self.decoder_feature_dims)}." + ) + + # first extract the low_res_features + last_features = hidden_states[-1] + last_features = self.projections[-1](last_features) + low_res_features = last_features # required later for fov_encoder + last_features = self.fusions[-1](last_features) + + # now get features through each layer + for i in range(len(hidden_states) - 2, -1, -1): + hidden_state = hidden_states[i] + projection = self.projections[i] + fusion = self.fusions[i] + + projected = projection(hidden_state) + last_features = fusion(last_features, projected) + + return last_features, low_res_features + + +class DepthProPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DepthProViTSwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +DEPTH_PRO_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_PRO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. 
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", + DEPTH_PRO_START_DOCSTRING, +) +class DepthProModel(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.use_fov = config.use_fov + + # dinov2 (vit) like encoder + self.encoder = DepthProEncoder(config) + # dpt (vit) like decoder + self.decoder = DepthProDecoder(config) + # dinov2 (vit) like encoder + self.fov_model = DepthProFOVModel(config) if self.use_fov else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + embeddings = { + "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, + "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, + } + if self.use_fov: + embeddings['fov_embeddings'] = self.fov_model.embeddings.patch_embeddings + return embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.fov_model.encoder.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # TODO + # @add_code_sample_docstrings( + # checkpoint=_CHECKPOINT_FOR_DOC, + # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, + # config_class=_CONFIG_FOR_DOC, + # modality="vision", + # expected_output=_EXPECTED_OUTPUT_SHAPE, + # ) + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encodings = self.encoder( + pixel_values, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + ) + + encodings_last_hidden_state = encodings.last_hidden_state + + for i in range(len(encodings_last_hidden_state)): + ic(encodings_last_hidden_state[i].shape) + + features, low_res_features = self.decoder(encodings_last_hidden_state) + + ic(features.shape) + ic(low_res_features.shape) + # ic(features); exit() + + if self.use_fov: + fov_out = self.fov_model( + pixel_values=pixel_values, + 
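+                # detach() stops gradients of the FOV head from flowing back into the decoder features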
low_res_features=low_res_features.detach(), + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + fov_out = None + + return features, fov_out + + +class DepthProDepthEstimationHead(nn.Module): + """ + # TODO + Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the paper's + supplementary material). + """ + + def __init__(self, config): + super().__init__() + self.config = config + + features = config.decoder_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), + nn.ConvTranspose2d( + in_channels=features//2, out_channels=features//2, + kernel_size=2, stride=2, padding=0, bias=True + ), + nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ) + + + def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: + predicted_depth = self.head(hidden_states) + predicted_depth = predicted_depth.squeeze(dim=1) + return predicted_depth + + +@add_start_docstrings( + """ + DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). + """, + DEPTH_PRO_START_DOCSTRING, +) +class DepthProForDepthEstimation(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.depth_pro = DepthProModel(config) + self.head = DepthProDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Returns: + + Examples: + ```python + >>> from transformers import AutoImageProcessor, DPTForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large") + >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... predicted_depth = outputs.predicted_depth + + >>> # interpolate to original size + >>> prediction = torch.nn.functional.interpolate( + ... predicted_depth.unsqueeze(1), + ... size=image.size[::-1], + ... mode="bicubic", + ... align_corners=False, + ... 
) + + >>> # visualize the prediction + >>> output = prediction.squeeze().cpu().numpy() + >>> formatted = (output * 255 / np.max(output)).astype("uint8") + >>> depth = Image.fromarray(formatted) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = [None] * 4 + + hidden_states, fov_out = self.depth_pro( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + predicted_depth = self.head(hidden_states) + ic(predicted_depth.shape) + ic(fov_out.shape) + + # ic(predicted_depth); exit() + ic(fov_out); exit() + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + # hidden_states=outputs.hidden_states, + # attentions=outputs.attentions, + ) From 1728a2ff687435bc615a8c67d9a4f55baa6ff8d4 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 9 Nov 2024 16:23:06 +0500 Subject: [PATCH 02/72] refactor model architechture --- .../depth_pro/configuration_depth_pro.py | 19 +- .../models/depth_pro/modeling_depth_pro.py | 478 ++++++++++-------- 2 files changed, 288 insertions(+), 209 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index ad0f1016f7a147..7e66e679c67ff1 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -129,9 +129,18 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, + patch_encoder_feature_dims = [256, 512, 1024, 1024], + patch_encoder_hook_ids = [5, 11], # patch_encoder_hook_ids = [5, 11, 17, 23], - patch_encoder_feature_dims = [256, 512, 1024, 1024], + intermediate_feature_dims = [256, 256], + intermediate_upsample_layers = [3, 2], + high_res_feature_dims = 512, + med_res_feature_dims = 1024, + low_res_feature_dims = 1024, + image_feature_dims = 1024, + global_feature_dims = 1024, + use_batch_norm_in_decoder=False, use_fov=False, **kwargs, @@ -165,3 +174,11 @@ def __init__( self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov = use_fov + + self.intermediate_feature_dims = intermediate_feature_dims + self.intermediate_upsample_layers = intermediate_upsample_layers + self.high_res_feature_dims = high_res_feature_dims + self.med_res_feature_dims = med_res_feature_dims + self.low_res_feature_dims = low_res_feature_dims + self.image_feature_dims = image_feature_dims + self.global_feature_dims = global_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f73b74060f5778..74669bc4e55753 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -568,105 +568,112 @@ def forward( ) +class 
DepthProUpsampleBlock(nn.Module): + def __init__( + self, + input_dims, + intermediate_dims, + output_dims, + n_upsample_layers, + use_proj=True, + bias=False, + ) -> None: + super().__init__() + + # create first projection block + if use_proj: + self.proj = nn.Conv2d( + in_channels=input_dims, + out_channels=intermediate_dims, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + else: + self.proj = nn.Identity() + + # create following upsample blocks + self.upsample_blocks = nn.Sequential() + for i in range(n_upsample_layers): + in_channels = intermediate_dims if i == 0 else output_dims + layer = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=bias, + ) + self.upsample_blocks.append(layer) + + def forward(self, features): + projected = self.proj(features) + return self.upsample_blocks(projected) + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size + self.patch_encoder_hook_ids = config.patch_encoder_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.intermediate_upsample_layers = config.intermediate_upsample_layers + self.out_size = 24 # TODO: image_size // patch_size # patch encoder self.patch_encoder = DepthProViT(config) - self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[0]].register_forward_hook( - self._intermediate0_hook - ) - self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[1]].register_forward_hook( - self._intermediate1_hook - ) # image encoder self.image_encoder = DepthProViT(config) - # upsampling features (1-2) - self.upsample_intermediate0 = self._create_project_upsample_block( - input_dims=config.hidden_size, - intermediate_dims=config.patch_encoder_feature_dims[0], - output_dims=config.decoder_hidden_size, - n_upsample_layers=3, - ) - self.upsample_intermediate1 = self._create_project_upsample_block( - input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[0], - n_upsample_layers=2, - ) + # upsampling intermediate features - (1-2) in diagram + self.upsample_intermediate = nn.ModuleList() + for i, (feature_dims, upsample_layers) in enumerate(zip( + self.intermediate_feature_dims, + self.intermediate_upsample_layers, + )): + intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims + upsample_block = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=intermediate_dims, + output_dims=feature_dims, + n_upsample_layers=upsample_layers, + ) + self.upsample_intermediate.append(upsample_block) - # upsampling features (3-5) - self.upsample_high_res = self._create_project_upsample_block( + # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram + self.upsample_high_res = DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[1], + intermediate_dims=config.high_res_feature_dims, + output_dims=config.high_res_feature_dims, n_upsample_layers=1, ) - self.upsample_med_res = self._create_project_upsample_block( + self.upsample_med_res = DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[2], + intermediate_dims=config.med_res_feature_dims, + output_dims=config.med_res_feature_dims, n_upsample_layers=1, ) - self.upsample_low_res = self._create_project_upsample_block( + self.upsample_low_res = 
DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[3], + intermediate_dims=config.low_res_feature_dims, + output_dims=config.low_res_feature_dims, n_upsample_layers=1, ) - # upsampling features (6) - self.upsample_image = nn.ConvTranspose2d( - in_channels=config.hidden_size, - out_channels=config.patch_encoder_feature_dims[3], - kernel_size=2, - stride=2, - padding=0, - bias=True, - ) - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=(config.patch_encoder_feature_dims[3] + config.patch_encoder_feature_dims[3]), - out_channels=config.patch_encoder_feature_dims[3], - kernel_size=1, - stride=1, - padding=0, + # upsampling image features - (6) in diagram + self.upsample_image = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=config.hidden_size, + output_dims=config.image_feature_dims, + n_upsample_layers=1, + use_proj=False, bias=True, ) - def _intermediate0_hook(self, model, input, output): - self.intermediate0_hidden_states = output[0] - - def _intermediate1_hook(self, model, input, output): - self.intermediate1_hidden_states = output[0] - - def _create_project_upsample_block( - self, - input_dims: int, - output_dims: int, - n_upsample_layers: int, - intermediate_dims: Optional[int] = None, - ) -> nn.Module: - - intermediate_dims = intermediate_dims or output_dims - - # Projection block followed by upsampling blocks. - blocks = [ - nn.Conv2d(input_dims, intermediate_dims, kernel_size=1, stride=1, padding=0, bias=False) - ] + [ - nn.ConvTranspose2d( - in_channels=(intermediate_dims if i == 0 else output_dims), - out_channels=output_dims, - kernel_size=2, - stride=2, - padding=0, - bias=False - ) for i in range(n_upsample_layers) - ] - - return nn.Sequential(*blocks) - def _interpolate(self, pixel_values, scale_factor): return nn.functional.interpolate( pixel_values, @@ -771,97 +778,100 @@ def forward( dim=0, ) - # STEP 3: apply patch encoder + # STEP 3: apply patch and image encoder patch_encodings = self.patch_encoder( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_hidden_states=True, # required for intermediate features return_dict=True, ) - patch_features = patch_encodings[0] - patch_features = self._reshape_feature( - patch_features, self.out_size, self.out_size + image_encodings = self.image_encoder( + pixel_values=low_res_patches, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, ) - # STEP 4: Get Intermediate Features (features 1 and 2) - - intermediate0_features = self._reshape_feature( - self.intermediate0_hidden_states, - self.out_size, - self.out_size, - ) - intermediate1_features = self._reshape_feature( - self.intermediate1_hidden_states, - self.out_size, - self.out_size, - ) - intermediate0_features = self._merge( - intermediate0_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 - ) - intermediate1_features = self._merge( - intermediate1_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 - ) + # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram - # STEP 5: Get Patch Encoder Features (features 3-5) + # a. extract hidden_state + hidden_state = patch_encodings.last_hidden_state + # b. 
reshape back to image like + features = self._reshape_feature( + hidden_state, self.out_size, self.out_size + ) high_res_features, med_res_features, low_res_features = torch.split( - patch_features, + features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, ) + # c. merge patches back together high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) - low_res_features = low_res_features + low_res_features = low_res_features # no merge required with low res image - # STEP 6: Get Image Encoder Features (features 6) + # d. upsample + high_res_features = self.upsample_high_res(high_res_features) + med_res_features = self.upsample_med_res(med_res_features) + low_res_features = self.upsample_low_res(low_res_features) - image_encodings = self.image_encoder( - pixel_values=low_res_patches, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - image_features = image_encodings[0] - image_features = self._reshape_feature( - image_features, self.out_size, self.out_size - ) + # STEP 5: get intermediate features - (1-2) in diagram - # STEP 7: Upsample All Features (feature 1-6) + intermediate_features = [] + for layer_id in self.patch_encoder_hook_ids: + + # a. extract hidden_state + hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well - # feature (1-2) - intermediate0_features = self.upsample_intermediate0( - intermediate0_features - ) - intermediate1_features = self.upsample_intermediate1( - intermediate1_features + # b. reshape back to image like + features = self._reshape_feature( + hidden_state, + self.out_size, + self.out_size, + ) + + # c. merge patches back together + features = self._merge( + features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + # d. upsample + features = self.upsample_intermediate[layer_id](features) + + intermediate_features.append(features) + + # STEP 6: get image features - (6) in diagram + + # a. extract hidden_state + hidden_state = image_encodings.last_hidden_state + + # b. reshape back to image like + image_features = self._reshape_feature( + hidden_state, self.out_size, self.out_size ) - # feature (3-5) - high_res_features = self.upsample_high_res(high_res_features) - med_res_features = self.upsample_med_res(med_res_features) - low_res_features = self.upsample_low_res(low_res_features) + # c. merge patches back together + # skipped, no merge required with low res image - # feature (6) + # d. 
upsample image_features = self.upsample_image(image_features) - image_features = self.fuse_image_with_low_res( - torch.cat((low_res_features, image_features), dim=1) - ) + # STEP 7: return these features last_hidden_state = [ - intermediate0_features, - intermediate1_features, + *intermediate_features, high_res_features, med_res_features, - # low_res_features, - image_features, # fused with low_res_features + low_res_features, + image_features, ] - hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_attentions else None - attentions = patch_encodings.attentions + image_encodings.attentions if output_hidden_states else None + hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None if not return_dict: return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) @@ -882,7 +892,7 @@ def __init__(self, config: DepthProConfig) -> None: self.encoder = DepthProViT(config) self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) - self.low_res_neck = nn.Sequential( + self.global_neck = nn.Sequential( nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) @@ -897,7 +907,7 @@ def __init__(self, config: DepthProConfig) -> None: def forward( self, pixel_values: torch.Tensor, - low_res_features: torch.Tensor, + global_features: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, @@ -923,19 +933,19 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - image_features = encoder_outputs[0] + last_hidden_state = encoder_outputs[0] - image_features = self.encoder_neck(image_features) + last_hidden_state = self.encoder_neck(last_hidden_state) # TODO: add some comments - image_features = image_features[:, 1:] - image_features = image_features.permute(0, 2, 1) + last_hidden_state = last_hidden_state[:, 1:] + last_hidden_state = last_hidden_state.permute(0, 2, 1) - low_res_features = self.low_res_neck(low_res_features) + global_features = self.global_neck(global_features) - image_features = image_features.reshape_as(low_res_features) - image_features = image_features + low_res_features - fov_output = self.head(image_features) + last_hidden_state = last_hidden_state.reshape_as(global_features) + last_hidden_state = last_hidden_state + global_features + fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(1) if not return_dict: @@ -1040,65 +1050,126 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - self.hidden_size = config.decoder_hidden_size - self.decoder_feature_dims = [config.decoder_hidden_size] + config.patch_encoder_feature_dims - - self.projections = nn.ModuleList() - self.fusions = nn.ModuleList() - for i, dim in enumerate(self.decoder_feature_dims): - - # Projection - if i != 0: - # conv for hidden_states[1:] - projection = nn.Conv2d( - in_channels=dim, - out_channels=self.hidden_size, + # for STEP 2: fuse low_res and image features + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.low_res_feature_dims+config.image_feature_dims, + out_channels=config.global_feature_dims, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + # for STEP 3: apply decoder block for global features + self.global_proj = 
nn.Conv2d( + in_channels=config.global_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.global_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 4: apply decoder block for med features + self.med_res_proj = nn.Conv2d( + in_channels=config.med_res_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.med_res_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 5: apply decoder block for high features + self.high_res_proj = nn.Conv2d( + in_channels=config.high_res_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.high_res_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 6: apply decoder block for intermediate features + self.intermediate_proj = nn.Sequential() + self.intermediate_fusion = nn.Sequential() + for i, feature_dim in enumerate(config.intermediate_feature_dims): + if i == 0: + # no projection for final intermediate layer + proj = nn.Identity() + fusion = DepthProFeatureFusionLayer(config, use_deconv=False) + else: + proj = nn.Conv2d( + in_channels=feature_dim, + out_channels=config.decoder_hidden_size, kernel_size=3, stride=1, padding=1, bias=False, ) - elif self.hidden_size != dim: - # first hidden_state with dim differnet from hidden_size - projection = nn.Conv2d( - in_channels=dim, - out_channels=self.hidden_size, - kernel_size=1, - bias=False, - ) - else: - # first hidden_state with dim same as hidden_size - projection = nn.Identity() - self.projections.append(projection) + fusion = DepthProFeatureFusionLayer(config) - # Fusion - fusion = DepthProFeatureFusionLayer(config, use_deconv=(i!=0)) - self.fusions.append(fusion) + self.intermediate_proj.append(proj) + self.intermediate_fusion.append(fusion) def forward(self, hidden_states): - if len(hidden_states) != len(self.decoder_feature_dims): - raise ValueError( - f"Got number of hidden_states = {len(hidden_states)}," - f"expected number of hidden_states = {len(self.decoder_feature_dims)}." 
- ) + # STEP 1: extract features - # first extract the low_res_features - last_features = hidden_states[-1] - last_features = self.projections[-1](last_features) - low_res_features = last_features # required later for fov_encoder - last_features = self.fusions[-1](last_features) + intermediate_features = hidden_states[:-4] + # intermediate_features_i.shape: [batch_size, config.intermediate_feature_dims_i, 768, 768], [1, 256, 384, 384] + high_res_features = hidden_states[-4] + # high_res_features.shape: [batch_size, config.high_res_feature_dims, 192, 192] + med_res_features = hidden_states[-3] + # med_res_features.shape: [batch_size, config.med_res_feature_dims, 96, 96] + low_res_features = hidden_states[-2] + # low_res_features.shape: [batch_size, config.low_res_feature_dims, 48, 48] + image_features = hidden_states[-1] + # image_features.shape: [batch_size, config.image_feature_dims, 48, 48] - # now get features through each layer - for i in range(len(hidden_states) - 2, -1, -1): - hidden_state = hidden_states[i] - projection = self.projections[i] - fusion = self.fusions[i] + # STEP 2: fuse low_res and image features - projected = projection(hidden_state) - last_features = fusion(last_features, projected) + global_features = torch.cat((low_res_features, image_features), dim=1) + global_features = self.fuse_image_with_low_res(global_features) + # global_features.shape: [batch_size, config.global_feature_dims, 48, 48] - return last_features, low_res_features + # STEP 3: apply decoder block for global features + + # apply projection: used by fusion now and then fov later + global_projected = self.global_proj(global_features) + # apply fusion: used by next projections and fusions + last_features = self.global_fusion(global_projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 96, 96] + + # STEP 4: apply decoder block for med features + + projected = self.med_res_proj(med_res_features) + last_features = self.med_res_fusion(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 192, 192] + + # STEP 5: apply decoder block for high features + + projected = self.high_res_proj(high_res_features) + last_features = self.high_res_fusion(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 384, 384] + + # STEP 6: apply decoder block for intermediate features + + for (features, proj_layer, fusion_layer) in zip( + # reversed becuase decoding is applied from last features to first features + intermediate_features[::-1], + self.intermediate_proj[::-1], + self.intermediate_fusion[::-1], + ): + projected = proj_layer(features) + last_features = fusion_layer(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] + # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] + + return last_features, global_projected class DepthProPreTrainedModel(PreTrainedModel): @@ -1233,26 +1304,18 @@ def forward( encodings = self.encoder( pixel_values, head_mask, - output_attentions, - output_hidden_states, - return_dict, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) - encodings_last_hidden_state = encodings.last_hidden_state - - for i in range(len(encodings_last_hidden_state)): - ic(encodings_last_hidden_state[i].shape) - - features, low_res_features = self.decoder(encodings_last_hidden_state) - - ic(features.shape) - ic(low_res_features.shape) - # ic(features); exit() + last_hidden_state = 
encodings[0] + last_hidden_state, global_features = self.decoder(last_hidden_state) if self.use_fov: fov_out = self.fov_model( pixel_values=pixel_values, - low_res_features=low_res_features.detach(), + global_features=global_features.detach(), head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -1261,7 +1324,8 @@ def forward( else: fov_out = None - return features, fov_out + # TODO: return all hidden_states + return last_hidden_state, fov_out class DepthProDepthEstimationHead(nn.Module): @@ -1375,18 +1439,16 @@ def forward( outputs = [None] * 4 - hidden_states, fov_out = self.depth_pro( + last_hidden_state, fov_out = self.depth_pro( pixel_values=pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - predicted_depth = self.head(hidden_states) - ic(predicted_depth.shape) - ic(fov_out.shape) + predicted_depth = self.head(last_hidden_state) - # ic(predicted_depth); exit() + ic(predicted_depth) ic(fov_out); exit() if not return_dict: From 11ce50c5cf2c87839909da806b1a9dc1665c11f2 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 12 Nov 2024 10:49:46 +0500 Subject: [PATCH 03/72] update model outputs --- .../models/depth_pro/modeling_depth_pro.py | 77 ++++++++++++++----- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 74669bc4e55753..daa2bbbdd64ba8 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -24,9 +24,10 @@ from torch import nn from dataclasses import dataclass +from ...utils import ModelOutput from ...activations import ACT2FN from ...modeling_outputs import ( - BaseModelOutput, + BaseModelOutput, DepthEstimatorOutput ) from ...utils import ( add_code_sample_docstrings, @@ -1232,6 +1233,18 @@ def _init_weights(self, module): """ +@dataclass +class DepthProModelOutput(BaseModelOutput): + """ + Base class for model's outputs, with potential fov, hidden states and attentions. + + Args: + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + Field of View Scaler. 
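The `fov` field added here follows the usual `transformers.utils.ModelOutput` pattern: fields left as `None` are dropped from the dict and tuple views, so the extra output only shows up when the FOV branch actually ran. A minimal standalone sketch of that behaviour; `ToyOutput` is invented for illustration and is not part of this patch:

```python
import torch
from dataclasses import dataclass
from typing import Optional

from transformers.utils import ModelOutput


@dataclass
class ToyOutput(ModelOutput):
    last_hidden_state: torch.FloatTensor = None
    fov: Optional[torch.FloatTensor] = None


with_fov = ToyOutput(last_hidden_state=torch.zeros(1, 4), fov=torch.tensor([0.5]))
without_fov = ToyOutput(last_hidden_state=torch.zeros(1, 4))

print(with_fov.fov)                 # tensor([0.5000])
print(len(with_fov.to_tuple()))     # 2 -> (last_hidden_state, fov)
print(len(without_fov.to_tuple()))  # 1 -> the None field is dropped
```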
+ """ + fov: Optional[torch.FloatTensor] = None + + @add_start_docstrings( "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", DEPTH_PRO_START_DOCSTRING, @@ -1306,14 +1319,14 @@ def forward( head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) - last_hidden_state = encodings[0] + last_hidden_state = encodings.last_hidden_state last_hidden_state, global_features = self.decoder(last_hidden_state) if self.use_fov: - fov_out = self.fov_model( + fov_encodings = self.fov_model( pixel_values=pixel_values, global_features=global_features.detach(), head_mask=head_mask, @@ -1321,11 +1334,24 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) + fov = fov_encodings.last_hidden_state else: - fov_out = None + fov = None + + attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + + if not return_dict: + outputs = (last_hidden_state, fov, hidden_states, attentions) + outputs = (i for i in outputs if i is not None) + return outputs - # TODO: return all hidden_states - return last_hidden_state, fov_out + return DepthProModelOutput( + last_hidden_state=last_hidden_state, + fov=fov, + hidden_states=hidden_states, + attentions=attentions, + ) class DepthProDepthEstimationHead(nn.Module): @@ -1360,6 +1386,18 @@ def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: return predicted_depth +@dataclass +class DepthProDepthEstimatorOutput(DepthEstimatorOutput): + """ + Base class for outputs of DepthProDepthEstimator. + + Args: + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + Field of View Scaler. + """ + fov: Optional[torch.FloatTensor] = None + + @add_start_docstrings( """ DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). 
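Because `DepthProDepthEstimationHead` is a plain `nn.Sequential`, its shape behaviour can be checked in isolation: the transposed convolution doubles the spatial resolution and the final 1x1 convolution collapses the channels to a single depth plane. A standalone sketch, assuming `decoder_hidden_size=256` and a deliberately small feature map to keep it cheap:

```python
import torch
from torch import nn

features = 256  # stand-in for config.decoder_hidden_size
head = nn.Sequential(
    nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
    nn.ConvTranspose2d(features // 2, features // 2, kernel_size=2, stride=2, padding=0, bias=True),
    nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
    nn.ReLU(True),
    nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
    nn.ReLU(),
)

decoder_features = torch.randn(1, features, 96, 96)   # (B, decoder_hidden_size, H, W)
predicted_depth = head(decoder_features).squeeze(dim=1)
print(predicted_depth.shape)                          # torch.Size([1, 192, 192])
```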
@@ -1436,31 +1474,28 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + # use_fov = use_fov if use_fov is not None else self.config.use_fov - outputs = [None] * 4 - - last_hidden_state, fov_out = self.depth_pro( + depth_pro_outputs = self.depth_pro( pixel_values=pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) + last_hidden_state = depth_pro_outputs[0] predicted_depth = self.head(last_hidden_state) - ic(predicted_depth) - ic(fov_out); exit() - if not return_dict: - if output_hidden_states: - output = (predicted_depth,) + outputs[1:] + if loss is None: + return (predicted_depth,) + depth_pro_outputs[1:] else: - output = (predicted_depth,) + outputs[2:] - return ((loss,) + output) if loss is not None else output + return (loss, predicted_depth) + depth_pro_outputs[1:] - return DepthEstimatorOutput( + return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - # hidden_states=outputs.hidden_states, - # attentions=outputs.attentions, + fov=depth_pro_outputs.fov, + hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, ) From 27e9593ada48c5c17a3a96e67bff534e022359ad Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:23:03 +0500 Subject: [PATCH 04/72] update init param to include use_fov_model --- .../models/depth_pro/modeling_depth_pro.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index daa2bbbdd64ba8..f8b69bfec86eb6 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1239,7 +1239,7 @@ class DepthProModelOutput(BaseModelOutput): Base class for model's outputs, with potential fov, hidden states and attentions. Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. 
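Together with the config rename in the next commit, the new `use_fov_model` argument lets a caller override the config default per instance. A hedged usage sketch, assuming the modules added by this series are importable from their new locations and keeping in mind that instantiation builds several full-size encoders with random weights:

```python
from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig
from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation

config = DepthProConfig(use_fov_model=False)                    # config-level default
model = DepthProForDepthEstimation(config, use_fov_model=True)  # per-instance override

print(model.use_fov_model)                    # True
print(model.depth_pro.fov_model is not None)  # True: the FOV branch was built despite the config default
```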
""" fov: Optional[torch.FloatTensor] = None @@ -1250,17 +1250,17 @@ class DepthProModelOutput(BaseModelOutput): DEPTH_PRO_START_DOCSTRING, ) class DepthProModel(DepthProPreTrainedModel): - def __init__(self, config): + def __init__(self, config, use_fov_model=None): super().__init__(config) self.config = config - self.use_fov = config.use_fov + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model # dinov2 (vit) like encoder self.encoder = DepthProEncoder(config) # dpt (vit) like decoder self.decoder = DepthProDecoder(config) # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov else None + self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None # Initialize weights and apply final processing self.post_init() @@ -1325,7 +1325,7 @@ def forward( last_hidden_state = encodings.last_hidden_state last_hidden_state, global_features = self.decoder(last_hidden_state) - if self.use_fov: + if self.use_fov_model: fov_encodings = self.fov_model( pixel_values=pixel_values, global_features=global_features.detach(), @@ -1392,7 +1392,7 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): Base class for outputs of DepthProDepthEstimator. Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. """ fov: Optional[torch.FloatTensor] = None @@ -1405,10 +1405,11 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): DEPTH_PRO_START_DOCSTRING, ) class DepthProForDepthEstimation(DepthProPreTrainedModel): - def __init__(self, config): + def __init__(self, config, use_fov_model=None): super().__init__(config) + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - self.depth_pro = DepthProModel(config) + self.depth_pro = DepthProModel(config, use_fov_model=self.use_fov_model) self.head = DepthProDepthEstimationHead(config) # Initialize weights and apply final processing @@ -1474,7 +1475,6 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - # use_fov = use_fov if use_fov is not None else self.config.use_fov depth_pro_outputs = self.depth_pro( pixel_values=pixel_values, From e74a7f505f91a24117e7838e367b72a50ff9e8f1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:24:21 +0500 Subject: [PATCH 05/72] update param name in config --- src/transformers/models/depth_pro/configuration_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 7e66e679c67ff1..a4037c99ee0fc0 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -142,7 +142,7 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, - use_fov=False, + use_fov_model=False, **kwargs, ): super().__init__(**kwargs) @@ -173,7 +173,7 @@ def __init__( self.patch_encoder_hook_ids = patch_encoder_hook_ids self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder - self.use_fov = use_fov + self.use_fov_model = use_fov_model self.intermediate_feature_dims = 
intermediate_feature_dims self.intermediate_upsample_layers = intermediate_upsample_layers From 8c2460b0655dd3ef698b765eb64c79cc785c7d10 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:51:56 +0500 Subject: [PATCH 06/72] fix hidden_states and attentions outputs for fov --- src/transformers/models/depth_pro/modeling_depth_pro.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f8b69bfec86eb6..620133771c0674 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1332,14 +1332,15 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) fov = fov_encodings.last_hidden_state + attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None else: fov = None - - attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + attentions = encodings.attentions + hidden_states = encodings.hidden_states if not return_dict: outputs = (last_hidden_state, fov, hidden_states, attentions) From 55f6ed3439cef2a731b8b78cba3b6142e3125447 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 11:20:56 +0500 Subject: [PATCH 07/72] sort config --- .../models/depth_pro/configuration_depth_pro.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index a4037c99ee0fc0..16ff55e9cb6c94 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -129,10 +129,7 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, - patch_encoder_feature_dims = [256, 512, 1024, 1024], - patch_encoder_hook_ids = [5, 11], - # patch_encoder_hook_ids = [5, 11, 17, 23], intermediate_feature_dims = [256, 256], intermediate_upsample_layers = [3, 2], high_res_feature_dims = 512, @@ -140,7 +137,6 @@ def __init__( low_res_feature_dims = 1024, image_feature_dims = 1024, global_feature_dims = 1024, - use_batch_norm_in_decoder=False, use_fov_model=False, **kwargs, @@ -171,10 +167,8 @@ def __init__( self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states self.patch_encoder_hook_ids = patch_encoder_hook_ids - self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model - self.intermediate_feature_dims = intermediate_feature_dims self.intermediate_upsample_layers = intermediate_upsample_layers self.high_res_feature_dims = high_res_feature_dims From b25dffb5d7f0aef86bb7c2dac990c24b28dafb5a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 11:21:13 +0500 Subject: [PATCH 08/72] complete minor todos --- .../models/depth_pro/modeling_depth_pro.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 
620133771c0674..956fe7afb7f7b9 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -938,8 +938,7 @@ def forward( last_hidden_state = self.encoder_neck(last_hidden_state) - # TODO: add some comments - last_hidden_state = last_hidden_state[:, 1:] + last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token last_hidden_state = last_hidden_state.permute(0, 2, 1) global_features = self.global_neck(global_features) @@ -1357,10 +1356,10 @@ def forward( class DepthProDepthEstimationHead(nn.Module): """ - # TODO - Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples - the predictions to the input resolution after the first convolutional layer (details can be found in the paper's - supplementary material). + The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. + This module comprises a sequence of convolutional and transposed convolutional layers + that process the feature map from the decoder to produce a single-channel depth map. + Key operations include dimensionality reduction and upsampling to match the input resolution. """ def __init__(self, config): From c225deb0d126a8420ccb5e381fa2e120abedabf0 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 13:22:15 +0500 Subject: [PATCH 09/72] update patching --- .../models/depth_pro/modeling_depth_pro.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 956fe7afb7f7b9..59b6d46e30cae2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -685,23 +685,25 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - patch_size = 384 # TODO: this should be infered - patch_stride = int(patch_size * (1 - overlap_ratio)) + B, C, H, W = pixel_values.shape - image_size = pixel_values.shape[-1] - steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1 + patch_size = 384 # TODO: this should be inferred + stride = int(patch_size * (1 - overlap_ratio)) - x_patch_list = [] - for j in range(steps): - j0 = j * patch_stride - j1 = j0 + patch_size + if pixel_values.dim() != 4: + raise ValueError("Input tensor must have shape (B, C, H, W).") - for i in range(steps): - i0 = i * patch_stride - i1 = i0 + patch_size - x_patch_list.append(pixel_values[..., j0:j1, i0:i1]) + # pixel_values.shape (B, C, H, W) + patches = torch.nn.functional.unfold( + pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) + ) + # patches.shape (B, -1, num_patches) + patches = patches.permute(2, 0, 1) + # patches.shape (num_patches, B, -1) + patches = patches.reshape(-1, C, patch_size, patch_size) + # patches.shape (B * num_patches, C, patch_size, patch_size) - return torch.cat(x_patch_list, dim=0) + return patches def _reshape_feature( self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 @@ -760,7 +762,7 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - batch_size = pixel_values.shape[0] + B, C, H, W = pixel_values.shape # STEP 1: create 3-level image @@ -812,8 +814,8 @@ def forward( ) # c. 
merge patches back together - high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) - med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) + high_res_features = self._merge(high_res_features, batch_size=B, padding=3) + med_res_features = self._merge(med_res_features, batch_size=B, padding=6) low_res_features = low_res_features # no merge required with low res image # d. upsample @@ -838,7 +840,7 @@ def forward( # c. merge patches back together features = self._merge( - features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + features[: B * 5 * 5], batch_size=B, padding=3 ) # d. upsample From 176932dc6aba7bfaf541bee756fc493f541434dd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 16:35:43 +0500 Subject: [PATCH 10/72] update config for encoder --- .../depth_pro/configuration_depth_pro.py | 14 ++- .../models/depth_pro/modeling_depth_pro.py | 108 ++++++++++-------- 2 files changed, 71 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 16ff55e9cb6c94..cdf3cf4d8d7077 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -119,7 +119,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-6, image_size=384, - patch_size=16, # changed + patch_size=16, # TODO remove this num_channels=3, qkv_bias=True, layerscale_value=1.0, @@ -139,6 +139,13 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, use_fov_model=False, + + # aux_image_size=1536, + # aux_patch_size=384, + aux_image_size=1536 // 2, + aux_patch_size=384 // 2, + aux_num_channels=3, + patch_embeddings_size=16, **kwargs, ): super().__init__(**kwargs) @@ -176,3 +183,8 @@ def __init__( self.low_res_feature_dims = low_res_feature_dims self.image_feature_dims = image_feature_dims self.global_feature_dims = global_feature_dims + + self.aux_image_size = aux_image_size + self.aux_patch_size = aux_patch_size + self.aux_num_channels = aux_num_channels + self.patch_embeddings_size = patch_embeddings_size diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 59b6d46e30cae2..3d3d356cc0eeb2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -55,22 +55,22 @@ class DepthProViTPatchEmbeddings(nn.Module): def __init__(self, config): super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.hidden_size - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + self.config = config + self.in_channels = config.aux_num_channels + self.out_channels = config.hidden_size + self.patch_embeddings_size = config.patch_embeddings_size + + self.projection = nn.Conv2d( + self.in_channels, + self.out_channels, + 
kernel_size=(self.patch_embeddings_size, self.patch_embeddings_size), + stride=(self.patch_embeddings_size, self.patch_embeddings_size), + ) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.num_channels: + if num_channels != self.config.aux_num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." @@ -89,10 +89,12 @@ class DepthProViTEmbeddings(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() + self.config = config + self.seq_len = (config.aux_patch_size // config.patch_embeddings_size) ** 2 + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) - num_patches = self.patch_embeddings.num_patches - self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.patch_size = config.patch_size self.config = config @@ -107,11 +109,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_patches = embeddings.shape[1] - 1 num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes - if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: return self.position_embeddings class_pos_embed = self.position_embeddings[:, :1] @@ -119,8 +120,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: dim = embeddings.shape[-1] - new_height = height // self.patch_size - new_width = width // self.patch_size + new_height = height // self.patch_size # TODO: check this + new_width = width // self.patch_size # TODO: check this sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) @@ -621,8 +622,9 @@ def __init__(self, config: DepthProConfig) -> None: self.patch_encoder_hook_ids = config.patch_encoder_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims self.intermediate_upsample_layers = config.intermediate_upsample_layers - - self.out_size = 24 # TODO: image_size // patch_size + + self.out_size = config.aux_patch_size // config.patch_embeddings_size + self.seq_len = self.out_size ** 2 # patch encoder self.patch_encoder = DepthProViT(config) @@ -685,23 +687,18 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - B, C, H, W = pixel_values.shape - - patch_size = 384 # TODO: this should be inferred + patch_size = self.config.aux_patch_size stride = int(patch_size * (1 - overlap_ratio)) - if pixel_values.dim() != 4: - raise ValueError("Input tensor must have shape (B, C, H, W).") - - # pixel_values.shape (B, C, H, W) + # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) patches = torch.nn.functional.unfold( pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) ) # patches.shape (B, -1, num_patches) 
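With the full-resolution defaults (`image_size=1536`, `patch_size=384`, restored a few commits later) and a 0.25 overlap, this unfold yields the 5 x 5 grid of overlapping patches that the later `B * 5 * 5` slice relies on; the medium level uses a 0.5 overlap for a 3 x 3 grid, and the low-resolution image is taken as a single patch. A standalone sanity check of the high-resolution case:

```python
import torch

image_size, patch_size, overlap_ratio = 1536, 384, 0.25
stride = int(patch_size * (1 - overlap_ratio))  # 288

pixel_values = torch.randn(1, 3, image_size, image_size)
patches = torch.nn.functional.unfold(
    pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride)
)                                               # (1, 3*384*384, 25): (1536 - 384) // 288 + 1 = 5 per axis
patches = patches.permute(2, 0, 1).reshape(-1, 3, patch_size, patch_size)
print(patches.shape)                            # torch.Size([25, 3, 384, 384])
```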
patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, C, patch_size, patch_size) - # patches.shape (B * num_patches, C, patch_size, patch_size) + patches = patches.reshape(-1, self.config.aux_num_channels, patch_size, patch_size) + # patches.shape (B * num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) return patches @@ -762,24 +759,33 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if pixel_values.dim() != 4: + raise ValueError("Input tensor must have shape (B, C, H, W).") + B, C, H, W = pixel_values.shape + # TODO validate: H = W = aux_image_size + # TODO validate: C = aux_num_channels + # TODO validate: aux_image_size = aux_patch_size * 4 + + # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # STEP 1: create 3-level image - high_res = pixel_values - med_res = self._interpolate(pixel_values, 0.5) - low_res = self._interpolate(pixel_values, 0.25) + high_res = pixel_values # (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + med_res = self._interpolate(pixel_values, 0.5) # (B, config.aux_num_channels, config.aux_image_size//2, config.aux_image_size//2) + low_res = self._interpolate(pixel_values, 0.25) # (B, config.aux_num_channels, config.aux_image_size//4, config.aux_image_size//4) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) - med_res_patches = self._patch(med_res, 0.5) - low_res_patches = low_res + high_res_patches = self._patch(high_res, 0.25) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + med_res_patches = self._patch(med_res, 0.5) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + low_res_patches = low_res # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) patches = torch.cat( (high_res_patches, med_res_patches, low_res_patches), dim=0, - ) + ) # (num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) # STEP 3: apply patch and image encoder @@ -801,42 +807,43 @@ def forward( # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram # a. extract hidden_state - hidden_state = patch_encodings.last_hidden_state + hidden_state = patch_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) + ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) high_res_features, med_res_features, low_res_features = torch.split( features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, - ) + ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) # c. merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, padding=3) - med_res_features = self._merge(med_res_features, batch_size=B, padding=6) - low_res_features = low_res_features # no merge required with low res image + high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) + med_res_features = self._merge(med_res_features, batch_size=B, padding=6) # (B, config.hidden_size, ~, ~) + low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, ~, ~) # d. 
upsample - high_res_features = self.upsample_high_res(high_res_features) - med_res_features = self.upsample_med_res(med_res_features) - low_res_features = self.upsample_low_res(low_res_features) + high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, ~, ~) + med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, ~, ~) + low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, ~, ~) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] for layer_id in self.patch_encoder_hook_ids: - + # a. extract hidden_state hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well + # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size, - ) + ) # (num_patches, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together features = self._merge( @@ -845,24 +852,25 @@ def forward( # d. upsample features = self.upsample_intermediate[layer_id](features) + # (B, config.intermediate_feature_dims[layer_id], ~, ~) intermediate_features.append(features) # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = image_encodings.last_hidden_state + hidden_state = image_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) + ) # (num_patches, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together # skipped, no merge required with low res image # d. 
upsample - image_features = self.upsample_image(image_features) + image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, ~, ~) # STEP 7: return these features last_hidden_state = [ From dcec5228b21352f6638c27c91f1d4056323eba95 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 16:46:17 +0500 Subject: [PATCH 11/72] fix config --- .../depth_pro/configuration_depth_pro.py | 20 +++----- .../models/depth_pro/modeling_depth_pro.py | 48 +++++++++---------- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index cdf3cf4d8d7077..fc12b37b19d073 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -118,9 +118,12 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-6, - image_size=384, - patch_size=16, # TODO remove this + # image_size=1536, + # patch_size=384, + image_size=1536 // 2, + patch_size=384 // 2, num_channels=3, + patch_embeddings_size=16, qkv_bias=True, layerscale_value=1.0, drop_path_rate=0.0, @@ -139,13 +142,6 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, use_fov_model=False, - - # aux_image_size=1536, - # aux_patch_size=384, - aux_image_size=1536 // 2, - aux_patch_size=384 // 2, - aux_num_channels=3, - patch_embeddings_size=16, **kwargs, ): super().__init__(**kwargs) @@ -163,6 +159,7 @@ def __init__( self.image_size = image_size self.patch_size = patch_size self.num_channels = num_channels + self.patch_embeddings_size = patch_embeddings_size self.qkv_bias = qkv_bias self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate @@ -183,8 +180,3 @@ def __init__( self.low_res_feature_dims = low_res_feature_dims self.image_feature_dims = image_feature_dims self.global_feature_dims = global_feature_dims - - self.aux_image_size = aux_image_size - self.aux_patch_size = aux_patch_size - self.aux_num_channels = aux_num_channels - self.patch_embeddings_size = patch_embeddings_size diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3d3d356cc0eeb2..d5639131397923 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -57,7 +57,7 @@ def __init__(self, config): super().__init__() self.config = config - self.in_channels = config.aux_num_channels + self.in_channels = config.num_channels self.out_channels = config.hidden_size self.patch_embeddings_size = config.patch_embeddings_size @@ -70,7 +70,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.config.aux_num_channels: + if num_channels != self.config.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." 
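With patch size and patch-embedding size both coming from the config, the previously hard-coded `out_size = 24` falls out as `patch_size // patch_embeddings_size`. A standalone sketch of the token arithmetic using the config defaults; the conv projection mirrors `DepthProViTPatchEmbeddings`:

```python
import torch
from torch import nn

patch_size, patch_embeddings_size, hidden_size = 384, 16, 1024  # config defaults
projection = nn.Conv2d(3, hidden_size, kernel_size=patch_embeddings_size, stride=patch_embeddings_size)

patch = torch.randn(1, 3, patch_size, patch_size)
tokens = projection(patch).flatten(2).transpose(1, 2)

out_size = patch_size // patch_embeddings_size
print(out_size, out_size**2)  # 24 576 -> seq_len per patch (plus one cls token)
print(tokens.shape)           # torch.Size([1, 576, 1024])
```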
@@ -90,14 +90,12 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - self.seq_len = (config.aux_patch_size // config.patch_embeddings_size) ** 2 + self.seq_len = (config.patch_size // config.patch_embeddings_size) ** 2 self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.patch_size = config.patch_size - self.config = config def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ @@ -120,8 +118,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: dim = embeddings.shape[-1] - new_height = height // self.patch_size # TODO: check this - new_width = width // self.patch_size # TODO: check this + new_height = height // self.config.patch_embeddings_size + new_width = width // self.config.patch_embeddings_size sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) @@ -623,7 +621,7 @@ def __init__(self, config: DepthProConfig) -> None: self.intermediate_feature_dims = config.intermediate_feature_dims self.intermediate_upsample_layers = config.intermediate_upsample_layers - self.out_size = config.aux_patch_size // config.patch_embeddings_size + self.out_size = config.patch_size // config.patch_embeddings_size self.seq_len = self.out_size ** 2 # patch encoder @@ -687,18 +685,18 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - patch_size = self.config.aux_patch_size + patch_size = self.config.patch_size stride = int(patch_size * (1 - overlap_ratio)) - # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) patches = torch.nn.functional.unfold( pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) ) # patches.shape (B, -1, num_patches) patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, self.config.aux_num_channels, patch_size, patch_size) - # patches.shape (B * num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + patches = patches.reshape(-1, self.config.num_channels, patch_size, patch_size) + # patches.shape (B * num_patches, config.num_channels, config.patch_size, config.patch_size) return patches @@ -764,28 +762,28 @@ def forward( B, C, H, W = pixel_values.shape - # TODO validate: H = W = aux_image_size - # TODO validate: C = aux_num_channels - # TODO validate: aux_image_size = aux_patch_size * 4 + # TODO validate: H = W = image_size + # TODO validate: C = num_channels + # TODO validate: image_size = patch_size * 4 - # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) # STEP 1: create 3-level image - high_res = pixel_values # (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) - med_res = self._interpolate(pixel_values, 0.5) # (B, config.aux_num_channels, config.aux_image_size//2, config.aux_image_size//2) - low_res = self._interpolate(pixel_values, 0.25) # (B, config.aux_num_channels, config.aux_image_size//4, 
config.aux_image_size//4) + high_res = pixel_values # (B, config.num_channels, config.image_size, config.image_size) + med_res = self._interpolate(pixel_values, 0.5) # (B, config.num_channels, config.image_size//2, config.image_size//2) + low_res = self._interpolate(pixel_values, 0.25) # (B, config.num_channels, config.image_size//4, config.image_size//4) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) - med_res_patches = self._patch(med_res, 0.5) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) - low_res_patches = low_res # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + high_res_patches = self._patch(high_res, 0.25) # (-1, config.num_channels, config.patch_size, config.patch_size) + med_res_patches = self._patch(med_res, 0.5) # (-1, config.num_channels, config.patch_size, config.patch_size) + low_res_patches = low_res # (-1, config.num_channels, config.patch_size, config.patch_size) patches = torch.cat( (high_res_patches, med_res_patches, low_res_patches), dim=0, - ) # (num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + ) # (num_patches, config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -812,12 +810,12 @@ def forward( # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) + ) # (num_patches, config.num_channels, self.out_size, self.out_size) high_res_features, med_res_features, low_res_features = torch.split( features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, - ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) + ) # (num_patches, config.num_channels, self.out_size, self.out_size) # c. 
merge patches back together high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) From 0384d2f189062259b3b99a3d692593e28902ec0b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 19:37:00 +0500 Subject: [PATCH 12/72] use correct defaults in config --- .../models/depth_pro/configuration_depth_pro.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index fc12b37b19d073..aff3eb3e2941ac 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -118,10 +118,8 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-6, - # image_size=1536, - # patch_size=384, - image_size=1536 // 2, - patch_size=384 // 2, + image_size=1536, + patch_size=384, num_channels=3, patch_embeddings_size=16, qkv_bias=True, From 85e4f868b65fa5b208883cb973824ca6e2557db8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 17 Nov 2024 23:47:50 +0500 Subject: [PATCH 13/72] update merge for compatibility with different image size --- .../depth_pro/configuration_depth_pro.py | 6 +- .../models/depth_pro/modeling_depth_pro.py | 135 +++++++++++------- 2 files changed, 88 insertions(+), 53 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index aff3eb3e2941ac..d9f973639ad0fd 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -108,9 +108,9 @@ class DepthProConfig(PretrainedConfig): def __init__( self, - hidden_size=1024, # changed + hidden_size=1024, decoder_hidden_size=256, - num_hidden_layers=24, # changed + num_hidden_layers=24, num_attention_heads=16, mlp_ratio=4, hidden_act="gelu", @@ -132,7 +132,6 @@ def __init__( reshape_hidden_states=True, patch_encoder_hook_ids = [5, 11], intermediate_feature_dims = [256, 256], - intermediate_upsample_layers = [3, 2], high_res_feature_dims = 512, med_res_feature_dims = 1024, low_res_feature_dims = 1024, @@ -172,7 +171,6 @@ def __init__( self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model self.intermediate_feature_dims = intermediate_feature_dims - self.intermediate_upsample_layers = intermediate_upsample_layers self.high_res_feature_dims = high_res_feature_dims self.med_res_feature_dims = med_res_feature_dims self.low_res_feature_dims = low_res_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index d5639131397923..316afe444fbb62 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -619,7 +619,6 @@ def __init__(self, config: DepthProConfig) -> None: self.decoder_hidden_size = config.decoder_hidden_size self.patch_encoder_hook_ids = config.patch_encoder_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims - self.intermediate_upsample_layers = config.intermediate_upsample_layers self.out_size = config.patch_size // config.patch_embeddings_size self.seq_len = self.out_size ** 2 @@ -632,17 +631,15 @@ def __init__(self, config: DepthProConfig) -> None: # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = 
nn.ModuleList() - for i, (feature_dims, upsample_layers) in enumerate(zip( - self.intermediate_feature_dims, - self.intermediate_upsample_layers, - )): + for i, feature_dims in enumerate(self.intermediate_feature_dims): intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims upsample_block = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=upsample_layers, + n_upsample_layers=1+len(self.intermediate_feature_dims)-i, ) + self.upsample_intermediate.append(upsample_block) # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram @@ -714,34 +711,46 @@ def _reshape_feature( hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) return hidden_states - def _merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor: + def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: """Merge the patched input into a image with sliding window.""" - steps = int(math.sqrt(x.shape[0] // batch_size)) - - idx = 0 - - output_list = [] - for j in range(steps): - output_row_list = [] - for i in range(steps): - output = x[batch_size * idx : batch_size * (idx + 1)] + # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) + box_size = int(math.sqrt(x.shape[0] // batch_size)) - if j != 0: - output = output[..., padding:, :] - if i != 0: - output = output[..., :, padding:] - if j != steps - 1: - output = output[..., :-padding, :] - if i != steps - 1: - output = output[..., :, :-padding] - - output_row_list.append(output) - idx += 1 - - output_row = torch.cat(output_row_list, dim=-1) - output_list.append(output_row) - output = torch.cat(output_list, dim=-2) - return output + """ + merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) + padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) + """ + padding = ( box_size * self.out_size - merge_out_size ) // ( 2 * box_size - 2 ) + + i = 0 + boxes = [] + for h in range(box_size): + boxes_in_row = [] + for w in range(box_size): + box = x[batch_size * i : batch_size * (i + 1)] + + if h != 0: + # remove pad from height if box is not at top border + box = box[..., padding:, :] + if w != 0: + # remove pad from width if box is not at left border + box = box[..., :, padding:] + if h != box_size - 1: + # remove pad from height if box is not at bottom border + box = box[..., :box.shape[-2]-padding, :] + if w != box_size - 1: + # remove pad from width if box is not at right border + box = box[..., :, :box.shape[-1]-padding] + + boxes_in_row.append(box) + i += 1 + + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + + boxes = torch.cat(boxes, dim=-2) + boxes = boxes[..., :merge_out_size, :merge_out_size] + return boxes def forward( self, @@ -818,19 +827,19 @@ def forward( ) # (num_patches, config.num_channels, self.out_size, self.out_size) # c. 
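(Aside on the padding arithmetic in `_merge`: solving the first line of the docstring comment for `padding` gives `(box_size * out_size - merge_out_size) / (2 * box_size - 2)`, which is what the code computes; the second comment line appears to carry a sign/denominator slip. A standalone check with illustrative numbers:)

```python
# Standalone check of the seam-trimming arithmetic used by `_merge`.
def merge_padding(box_size: int, out_size: int, merge_out_size: int) -> int:
    # From merge_out_size = (box_size - 2) * (out_size - 2 * padding)
    #                       + 2 * (out_size - padding)
    # it follows that padding = (box_size * out_size - merge_out_size) / (2 * box_size - 2).
    return (box_size * out_size - merge_out_size) // (2 * box_size - 2)

def merged_side(box_size: int, out_size: int, padding: int) -> int:
    # Interior patches are trimmed on both sides, border patches only on the inner side.
    return (box_size - 2) * (out_size - 2 * padding) + 2 * (out_size - padding)

# Illustrative numbers: a 5x5 grid of 24x24 patch features merged back to 96x96.
pad = merge_padding(box_size=5, out_size=24, merge_out_size=96)  # 3
assert merged_side(5, 24, pad) == 96
```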
merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) - med_res_features = self._merge(med_res_features, batch_size=B, padding=6) # (B, config.hidden_size, ~, ~) - low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, ~, ~) + high_res_features = self._merge(high_res_features, batch_size=B, merge_out_size=self.out_size*4) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) + med_res_features = self._merge(med_res_features, batch_size=B, merge_out_size=self.out_size*2) # (B, config.hidden_size, self.out_size*2**1, self.out_size*2**1) + low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, self.out_size*2**0, self.out_size*2**0) # d. upsample - high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, ~, ~) - med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, ~, ~) - low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, ~, ~) + high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, self.out_size*2**3, self.out_size*2**3) + med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, self.out_size*2**2, self.out_size*2**2) + low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, self.out_size*2**1, self.out_size*2**1) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] - for layer_id in self.patch_encoder_hook_ids: + for i, layer_id in enumerate(self.patch_encoder_hook_ids): # a. extract hidden_state hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well @@ -845,12 +854,12 @@ def forward( # c. merge patches back together features = self._merge( - features[: B * 5 * 5], batch_size=B, padding=3 - ) + features[: B * 5 * 5], batch_size=B, merge_out_size=self.out_size*4, + ) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) # d. upsample features = self.upsample_intermediate[layer_id](features) - # (B, config.intermediate_feature_dims[layer_id], ~, ~) + # (B, config.intermediate_feature_dims[i], self.out_size*2**(3+total-i), self.out_size*2**(3+total-i)) intermediate_features.append(features) @@ -868,16 +877,25 @@ def forward( # skipped, no merge required with low res image # d. 
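(Aside: the merge/upsample bookkeeping above can be checked by hand. Assuming the defaults used in this series, patch_size=384 and patch_embeddings_size=16 give out_size=24, and the sketch below reproduces the per-feature side lengths recorded in the shape comments a little further down.)

```python
# Side lengths produced by the merge/upsample steps above, for image_size=1536.
patch_size, patch_embeddings_size = 384, 16
out_size = patch_size // patch_embeddings_size               # 24

high_res = out_size * 4 * 2                                  # merged to 96, one upsample -> 192
med_res = out_size * 2 * 2                                   # merged to 48, one upsample -> 96
low_res = out_size * 2                                       # 24, one upsample -> 48
image = out_size * 2                                         # 24, one upsample -> 48

num_hooks = 2                                                # len(intermediate_feature_dims)
intermediate = [out_size * 4 * 2 ** (1 + num_hooks - i) for i in range(num_hooks)]

print(intermediate + [high_res, med_res, low_res, image])    # [768, 384, 192, 96, 48, 48]
```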
upsample - image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, ~, ~) + image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) # STEP 7: return these features last_hidden_state = [ - *intermediate_features, - high_res_features, - med_res_features, - low_res_features, - image_features, + *intermediate_features, # (B, config.image_feature_dims, self.out_size*2**3+total-i, self.out_size*2**3+total-i) + high_res_features, # (B, config.image_feature_dims, self.out_size*2**3, self.out_size*2**3) + med_res_features, # (B, config.image_feature_dims, self.out_size*2**2, self.out_size*2**2) + low_res_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) + image_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) ] + # for i in last_hidden_state: + # ic(i.shape) + # exit() + + # 768, 384, 192, 96, 48, 48 - image_size=1536 + # 384, 192, 96, 48, 24, 24 - image_size=768 (ideal) + # 288, 144, 72, 24, 24, 24 - image_size=768 (practical) + # 1536, 768, 384, 192, 96, 96 - image_size=3072 (ideal) + # 1728, 864, 432, 240, 96, 96 - image_size=3072 (practical) hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None @@ -951,6 +969,11 @@ def forward( global_features = self.global_neck(global_features) + ic(last_hidden_state.shape) + ic(global_features.shape) + + # exit() + last_hidden_state = last_hidden_state.reshape_as(global_features) last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) @@ -1107,7 +1130,15 @@ def __init__(self, config: DepthProConfig) -> None: for i, feature_dim in enumerate(config.intermediate_feature_dims): if i == 0: # no projection for final intermediate layer - proj = nn.Identity() + if feature_dim == config.decoder_hidden_size: + proj = nn.Identity() + else: + proj = nn.Conv2d( + in_channels=feature_dim, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) fusion = DepthProFeatureFusionLayer(config, use_deconv=False) else: proj = nn.Conv2d( @@ -1124,6 +1155,10 @@ def __init__(self, config: DepthProConfig) -> None: self.intermediate_fusion.append(fusion) def forward(self, hidden_states): + ic("Start of Decoder") + + for i in hidden_states: + ic(i.shape) # STEP 1: extract features @@ -1492,7 +1527,9 @@ def forward( return_dict=True, ) last_hidden_state = depth_pro_outputs[0] + ic(last_hidden_state.shape) predicted_depth = self.head(last_hidden_state) + ic(predicted_depth.shape) if not return_dict: if loss is None: From 00e4aa3b7bb04324cd08f2f87a2a34f4033fccca Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 11:04:58 +0500 Subject: [PATCH 14/72] restructure encoder for custom configuration --- .../depth_pro/configuration_depth_pro.py | 21 +- .../models/depth_pro/modeling_depth_pro.py | 842 ++++++++---------- 2 files changed, 395 insertions(+), 468 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d9f973639ad0fd..0558309004171f 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -59,6 +59,7 @@ class DepthProConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon 
used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): + TODO: image_size / 2**n_decoder_blocks = patch_size / patch_embeddings_size The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. @@ -130,13 +131,11 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, - patch_encoder_hook_ids = [5, 11], + intermediate_hook_ids = [11, 5], intermediate_feature_dims = [256, 256], - high_res_feature_dims = 512, - med_res_feature_dims = 1024, - low_res_feature_dims = 1024, - image_feature_dims = 1024, - global_feature_dims = 1024, + scaled_images_ratios = [0.25, 0.5, 1], + scaled_images_overlap_ratios = [0.0, 0.5, 0.25], + scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_decoder=False, use_fov_model=False, **kwargs, @@ -167,12 +166,10 @@ def __init__( ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.patch_encoder_hook_ids = patch_encoder_hook_ids self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model + self.intermediate_hook_ids = intermediate_hook_ids self.intermediate_feature_dims = intermediate_feature_dims - self.high_res_feature_dims = high_res_feature_dims - self.med_res_feature_dims = med_res_feature_dims - self.low_res_feature_dims = low_res_feature_dims - self.image_feature_dims = image_feature_dims - self.global_feature_dims = global_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 316afe444fbb62..9f146177402c00 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -226,7 +226,6 @@ def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
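(Aside: after this restructuring the multi-scale behaviour is driven entirely by the new list-valued config fields. A minimal sketch of constructing such a config, assuming the branch from this patch series is installed; the values simply repeat the new defaults.)

```python
# Minimal sketch: the new list-valued fields decide how many scaled copies of the image
# are encoded and which patch-encoder layers are hooked for intermediate features.
from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig

config = DepthProConfig(
    image_size=1536,
    patch_size=384,
    patch_embeddings_size=16,
    scaled_images_ratios=[0.25, 0.5, 1],            # low to high resolution
    scaled_images_overlap_ratios=[0.0, 0.5, 0.25],  # sliding-window overlap per scale
    scaled_images_feature_dims=[1024, 1024, 512],
    intermediate_hook_ids=[11, 5],                  # patch-encoder layers to tap
    intermediate_feature_dims=[256, 256],
)
print(config.patch_size // config.patch_embeddings_size)  # 24 features per patch side
```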
@@ -617,11 +616,40 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.decoder_hidden_size = config.decoder_hidden_size - self.patch_encoder_hook_ids = config.patch_encoder_hook_ids + + self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.n_scaled_images = len(self.scaled_images_ratios) + self.n_intermediate_hooks = len(self.intermediate_hook_ids) self.out_size = config.patch_size // config.patch_embeddings_size - self.seq_len = self.out_size ** 2 + self.seq_len = self.out_size ** 2 # each patch is flattened + + # config.scaled_images_ratios is sorted + if config.scaled_images_ratios != sorted(config.scaled_images_ratios): + raise ValueError( + f"Values in scaled_images_ratios={config.scaled_images_ratios} " + "should be sorted from low to high" + ) + + # lowest image resolution is greator than the patch_size + if config.scaled_images_ratios[0] * config.image_size < config.patch_size: + raise ValueError( + "Image cannot be scaled to a size less than patch_size. " + f"Provide values in scaled_images_ratios={config.scaled_images_ratios} suitable " + f"to the given patch_size={config.patch_size}." + ) + + # patch_size should be a divisible by patch_embeddings_size + # else it raises an exception in DepthProViTPatchEmbeddings + if config.patch_size % config.patch_embeddings_size != 0: + raise ValueError( + f"patch_size={config.patch_size} should be divisible " + f"by patch_embeddings_size={config.patch_embeddings_size}." 
+ ) # patch encoder self.patch_encoder = DepthProViT(config) @@ -629,6 +657,17 @@ def __init__(self, config: DepthProConfig) -> None: # image encoder self.image_encoder = DepthProViT(config) + # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram + self.upsample_scaled_images = nn.ModuleList() + for i, feature_dims in enumerate(self.scaled_images_feature_dims): + upsample_block = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=feature_dims, + output_dims=feature_dims, + n_upsample_layers=1, + ) + self.upsample_scaled_images.append(upsample_block) + # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() for i, feature_dims in enumerate(self.intermediate_feature_dims): @@ -637,42 +676,33 @@ def __init__(self, config: DepthProConfig) -> None: input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=1+len(self.intermediate_feature_dims)-i, + n_upsample_layers=2+i, ) - self.upsample_intermediate.append(upsample_block) - # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram - self.upsample_high_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.high_res_feature_dims, - output_dims=config.high_res_feature_dims, - n_upsample_layers=1, - ) - self.upsample_med_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.med_res_feature_dims, - output_dims=config.med_res_feature_dims, - n_upsample_layers=1, - ) - self.upsample_low_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.low_res_feature_dims, - output_dims=config.low_res_feature_dims, - n_upsample_layers=1, - ) - # upsampling image features - (6) in diagram self.upsample_image = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=config.hidden_size, - output_dims=config.image_feature_dims, + output_dims=config.scaled_images_feature_dims[0], n_upsample_layers=1, use_proj=False, bias=True, ) + # for STEP 7: fuse low_res and image features + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.scaled_images_feature_dims[0]*2, + out_channels=config.scaled_images_feature_dims[0], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + def _interpolate(self, pixel_values, scale_factor): + if scale_factor == 1: + return pixel_values return nn.functional.interpolate( pixel_values, size=None, @@ -682,6 +712,10 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): + if pixel_values.shape[-1] == self.config.patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + patch_size = self.config.patch_size stride = int(patch_size * (1 - overlap_ratio)) @@ -712,7 +746,11 @@ def _reshape_feature( return hidden_states def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: - """Merge the patched input into a image with sliding window.""" + if batch_size == x.shape[0]: + # merge only if the patches were created from this scaled image + # pathces are not created when scaled image size is equal to patch size + return x + # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) box_size = int(math.sqrt(x.shape[0] // batch_size)) @@ -771,28 +809,35 @@ def forward( B, C, H, W = pixel_values.shape - # TODO validate: H = W = image_size - # TODO validate: C = num_channels - # TODO validate: image_size = 
patch_size * 4 + if not (H == W == self.config.image_size): + raise ValueError( + f"Height={H} and Width={W} doesnot match the specified image_size={self.config.image_size} in config." + ) + + if not (C == self.config.num_channels): + raise ValueError( + f"Found {C} channels in image, expected number of channels is {self.config.num_channels} from config." + ) # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) # STEP 1: create 3-level image - high_res = pixel_values # (B, config.num_channels, config.image_size, config.image_size) - med_res = self._interpolate(pixel_values, 0.5) # (B, config.num_channels, config.image_size//2, config.image_size//2) - low_res = self._interpolate(pixel_values, 0.25) # (B, config.num_channels, config.image_size//4, config.image_size//4) + scaled_images = [] + for ratio in self.scaled_images_ratios: + scaled_images.append(self._interpolate(pixel_values, ratio)) + # (B, config.num_channels, config.image_size * ratio, config.image_size * ratio) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) # (-1, config.num_channels, config.patch_size, config.patch_size) - med_res_patches = self._patch(med_res, 0.5) # (-1, config.num_channels, config.patch_size, config.patch_size) - low_res_patches = low_res # (-1, config.num_channels, config.patch_size, config.patch_size) - - patches = torch.cat( - (high_res_patches, med_res_patches, low_res_patches), - dim=0, - ) # (num_patches, config.num_channels, config.patch_size, config.patch_size) + for i in range(self.n_scaled_images): + scaled_images[i] = self._patch( + scaled_images[i], + overlap_ratio=self.scaled_images_overlap_ratios[i], + ) + scaled_images_num_patches = [len(i) for i in scaled_images] + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + # (sum(scaled_images_num_patches), config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -803,8 +848,13 @@ def forward( output_hidden_states=True, # required for intermediate features return_dict=True, ) + scaled_images_last_hidden_state = torch.split_with_sizes( + patch_encodings.last_hidden_state, + scaled_images_num_patches[::-1] + )[::-1] # -1 as patch encoder expects high res patches first + image_encodings = self.image_encoder( - pixel_values=low_res_patches, + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -813,89 +863,87 @@ def forward( # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram - # a. extract hidden_state - hidden_state = patch_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) + scaled_images_features = [] + for i in range(self.n_scaled_images): + # a. extract hidden_state + hidden_state = scaled_images_last_hidden_state[i] + # (scaled_images_num_patches[i], self.seq_len+1, config.hidden_size) - # b. reshape back to image like - features = self._reshape_feature( - hidden_state, self.out_size, self.out_size - ) # (num_patches, config.num_channels, self.out_size, self.out_size) - high_res_features, med_res_features, low_res_features = torch.split( - features, - [len(high_res_patches), len(med_res_patches), len(low_res_patches)], - dim=0, - ) # (num_patches, config.num_channels, self.out_size, self.out_size) + # b. 
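(Aside on step 3: patches from all scales are concatenated from high to low resolution before a single patch-encoder pass, so the output has to be split with the reversed patch counts and flipped back. A small illustration, using the patch counts implied by the default ratios and overlaps for a batch of one image:)

```python
import torch

# With the defaults (image_size=1536, patch_size=384, ratios [0.25, 0.5, 1],
# overlaps [0.0, 0.5, 0.25]) a single image yields 1, 9 and 25 patches per scale.
scaled_images_num_patches = [1, 9, 25]               # low, medium, high resolution
seq_len, hidden_size = 24 * 24, 1024
encodings = torch.randn(sum(scaled_images_num_patches), seq_len + 1, hidden_size)

# [::-1] twice: the encoder consumed high-res patches first, so split accordingly
# and then flip the resulting chunks back to low-to-high order.
per_scale = torch.split_with_sizes(encodings, scaled_images_num_patches[::-1])[::-1]
for expected, chunk in zip(scaled_images_num_patches, per_scale):
    assert chunk.shape[0] == expected
```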
reshape back to image like + features = self._reshape_feature( + hidden_state, self.out_size, self.out_size + ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) - # c. merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, merge_out_size=self.out_size*4) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) - med_res_features = self._merge(med_res_features, batch_size=B, merge_out_size=self.out_size*2) # (B, config.hidden_size, self.out_size*2**1, self.out_size*2**1) - low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, self.out_size*2**0, self.out_size*2**0) + # c. merge patches back together + features = self._merge( + features, batch_size=B, merge_out_size=self.out_size*2**i + ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) - # d. upsample - high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, self.out_size*2**3, self.out_size*2**3) - med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, self.out_size*2**2, self.out_size*2**2) - low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, self.out_size*2**1, self.out_size*2**1) + # d. upsample + features = self.upsample_scaled_images[i](features) + # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) + + scaled_images_features.append(features) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] - for i, layer_id in enumerate(self.patch_encoder_hook_ids): + for i in range(self.n_intermediate_hooks): # a. extract hidden_state - hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well - # (num_patches, self.seq_len+1, config.hidden_size) + layer_id = self.intermediate_hook_ids[i] + 1 # +1 to correct index position as hidden_states contain embedding output as well + hidden_state = patch_encodings.hidden_states[layer_id] + hidden_state = hidden_state[:scaled_images_num_patches[-1]] # num_patches to be of same length as highest resolution + # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size, - ) # (num_patches, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together features = self._merge( - features[: B * 5 * 5], batch_size=B, merge_out_size=self.out_size*4, - ) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) + features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - features = self.upsample_intermediate[layer_id](features) - # (B, config.intermediate_feature_dims[i], self.out_size*2**(3+total-i), self.out_size*2**(3+total-i)) + features = self.upsample_intermediate[i](features) + # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) intermediate_features.append(features) # STEP 6: get image features - (6) in diagram # a. 
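(Aside on the `+ 1` in step 5: with `output_hidden_states=True` the returned tuple starts with the embedding output, so encoder block `k` sits at index `k + 1`. A toy illustration:)

```python
# Why intermediate_hook_ids are offset by one: index 0 of `hidden_states` is the
# embedding output, indices 1..num_hidden_layers are the encoder blocks.
num_hidden_layers = 24
hidden_states = ["embeddings"] + [f"block_{i}" for i in range(num_hidden_layers)]

intermediate_hook_ids = [11, 5]
for hook_id in intermediate_hook_ids:
    assert hidden_states[hook_id + 1] == f"block_{hook_id}"
```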
extract hidden_state - hidden_state = image_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) + hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) # (num_patches, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - # skipped, no merge required with low res image + image_features = self._merge( + image_features, batch_size=B, merge_out_size=self.out_size*2**(0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) - - # STEP 7: return these features - last_hidden_state = [ - *intermediate_features, # (B, config.image_feature_dims, self.out_size*2**3+total-i, self.out_size*2**3+total-i) - high_res_features, # (B, config.image_feature_dims, self.out_size*2**3, self.out_size*2**3) - med_res_features, # (B, config.image_feature_dims, self.out_size*2**2, self.out_size*2**2) - low_res_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) - image_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) + image_features = self.upsample_image(image_features) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) + + # STEP 7: apply fusion (global_features = image_features + scaled_images_features[0]) + # fuses image_features with lowest resolution features as they are of same size + scaled_images_features[0] = torch.cat((scaled_images_features[0], image_features), dim=1) + scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) + + # STEP 8: return these features in order of increasing size as what decoder expects + last_hidden_state = [ + # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) + *scaled_images_features, + # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) + *intermediate_features, ] - # for i in last_hidden_state: - # ic(i.shape) - # exit() - - # 768, 384, 192, 96, 48, 48 - image_size=1536 - # 384, 192, 96, 48, 24, 24 - image_size=768 (ideal) - # 288, 144, 72, 24, 24, 24 - image_size=768 (practical) - # 1536, 768, 384, 192, 96, 96 - image_size=3072 (ideal) - # 1728, 864, 432, 240, 96, 96 - image_size=3072 (practical) hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None @@ -910,84 +958,133 @@ def forward( ) -class DepthProFOVModel(nn.Module): - def __init__(self, config: DepthProConfig) -> None: - super().__init__() +class DepthProPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
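(Aside on step 7: the image-encoder features and the lowest-resolution scaled features share the same spatial size, so they are concatenated on the channel axis and mixed by a 1x1 convolution. A shape-only sketch with the default dimensions:)

```python
import torch
from torch import nn

# Step-7 fusion in isolation: channel concat followed by a 1x1 convolution.
# 1024 mirrors scaled_images_feature_dims[0]; 48 is out_size * 2 after upsampling.
feature_dims, side = 1024, 48
fuse_image_with_low_res = nn.Conv2d(feature_dims * 2, feature_dims, kernel_size=1)

low_res_features = torch.randn(1, feature_dims, side, side)
image_features = torch.randn(1, feature_dims, side, side)
fused = fuse_image_with_low_res(torch.cat((low_res_features, image_features), dim=1))
print(fused.shape)  # torch.Size([1, 1024, 48, 48])
```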
+ """ + + config_class = DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DepthProViTSwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +DEPTH_PRO_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_PRO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", + DEPTH_PRO_START_DOCSTRING, +) +class DepthProModel(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) self.config = config - self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.encoder = DepthProEncoder(config) + # Initialize weights and apply final processing + self.post_init() - self.encoder = DepthProViT(config) - self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) - self.global_neck = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True) - ) - self.head = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), - ) + def get_input_embeddings(self): + embeddings = { + "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, + "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, + } + return embeddings + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # TODO + # @add_code_sample_docstrings( + # checkpoint=_CHECKPOINT_FOR_DOC, + # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, + # config_class=_CONFIG_FOR_DOC, + # modality="vision", + # expected_output=_EXPECTED_OUTPUT_SHAPE, + # ) def forward( self, - pixel_values: torch.Tensor, - global_features: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pixel_values = nn.functional.interpolate( - pixel_values, - size=None, - scale_factor=0.25, - mode="bilinear", - align_corners=False, - ) - encoder_outputs = self.encoder( + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encodings = self.encoder( pixel_values, - 
head_mask=head_mask, + head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - - last_hidden_state = self.encoder_neck(last_hidden_state) - - last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token - last_hidden_state = last_hidden_state.permute(0, 2, 1) - - global_features = self.global_neck(global_features) - - ic(last_hidden_state.shape) - ic(global_features.shape) - # exit() - - last_hidden_state = last_hidden_state.reshape_as(global_features) - last_hidden_state = last_hidden_state + global_features - fov_output = self.head(last_hidden_state) - fov_output = fov_output.reshape(1) - - if not return_dict: - head_outputs = (fov_output,) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutput( - last_hidden_state=fov_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + return encodings # Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro @@ -1075,325 +1172,109 @@ def forward(self, hidden_state, residual=None): return hidden_state -# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage -class DepthProDecoder(nn.Module): - def __init__(self, config: DepthProConfig) -> None: +# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro with extra layer parameters +class DepthProFeatureFusionStage(nn.Module): + def __init__(self, config, num_layers): super().__init__() - self.config = config - - # for STEP 2: fuse low_res and image features - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=config.low_res_feature_dims+config.image_feature_dims, - out_channels=config.global_feature_dims, - kernel_size=1, - stride=1, - padding=0, - bias=True, - ) - - # for STEP 3: apply decoder block for global features - self.global_proj = nn.Conv2d( - in_channels=config.global_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.global_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 4: apply decoder block for med features - self.med_res_proj = nn.Conv2d( - in_channels=config.med_res_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.med_res_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 5: apply decoder block for high features - self.high_res_proj = nn.Conv2d( - in_channels=config.high_res_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.high_res_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 6: apply decoder block for intermediate features - self.intermediate_proj = nn.Sequential() - self.intermediate_fusion = nn.Sequential() - for i, feature_dim in enumerate(config.intermediate_feature_dims): - if i == 0: - # no projection for final intermediate layer - if feature_dim == config.decoder_hidden_size: - proj = nn.Identity() - else: - proj = nn.Conv2d( - in_channels=feature_dim, - out_channels=config.decoder_hidden_size, - kernel_size=1, - bias=False, - ) - fusion = DepthProFeatureFusionLayer(config, use_deconv=False) - else: - proj = nn.Conv2d( - in_channels=feature_dim, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - fusion = DepthProFeatureFusionLayer(config) - - self.intermediate_proj.append(proj) - 
self.intermediate_fusion.append(fusion) + self.num_layers = num_layers + self.layers = nn.ModuleList() + for _ in range(self.num_layers-1): + self.layers.append(DepthProFeatureFusionLayer(config)) + # final layer doesnot require deconvolution + self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) def forward(self, hidden_states): - ic("Start of Decoder") - - for i in hidden_states: - ic(i.shape) - - # STEP 1: extract features - - intermediate_features = hidden_states[:-4] - # intermediate_features_i.shape: [batch_size, config.intermediate_feature_dims_i, 768, 768], [1, 256, 384, 384] - high_res_features = hidden_states[-4] - # high_res_features.shape: [batch_size, config.high_res_feature_dims, 192, 192] - med_res_features = hidden_states[-3] - # med_res_features.shape: [batch_size, config.med_res_feature_dims, 96, 96] - low_res_features = hidden_states[-2] - # low_res_features.shape: [batch_size, config.low_res_feature_dims, 48, 48] - image_features = hidden_states[-1] - # image_features.shape: [batch_size, config.image_feature_dims, 48, 48] - - # STEP 2: fuse low_res and image features - - global_features = torch.cat((low_res_features, image_features), dim=1) - global_features = self.fuse_image_with_low_res(global_features) - # global_features.shape: [batch_size, config.global_feature_dims, 48, 48] - - # STEP 3: apply decoder block for global features - - # apply projection: used by fusion now and then fov later - global_projected = self.global_proj(global_features) - # apply fusion: used by next projections and fusions - last_features = self.global_fusion(global_projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 96, 96] - - # STEP 4: apply decoder block for med features - - projected = self.med_res_proj(med_res_features) - last_features = self.med_res_fusion(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 192, 192] - - # STEP 5: apply decoder block for high features - - projected = self.high_res_proj(high_res_features) - last_features = self.high_res_fusion(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 384, 384] - - # STEP 6: apply decoder block for intermediate features - - for (features, proj_layer, fusion_layer) in zip( - # reversed becuase decoding is applied from last features to first features - intermediate_features[::-1], - self.intermediate_proj[::-1], - self.intermediate_fusion[::-1], - ): - projected = proj_layer(features) - last_features = fusion_layer(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] - # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] - - return last_features, global_projected - - -class DepthProPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = DepthProConfig - base_model_prefix = "depth_pro" - main_input_name = "pixel_values" - supports_gradient_checkpointing = True - _no_split_modules = ["DepthProViTSwiGLUFFN"] - _supports_sdpa = True - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -DEPTH_PRO_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. - - Parameters: - config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -DEPTH_PRO_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] - for details. - - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. -""" - + if self.num_layers != len(hidden_states): + raise ValueError( + f"num_layers={self.num_layers} in DepthProFeatureFusionStage" + f"doesnot match len(hidden_states)={len(hidden_states)}" + ) -@dataclass -class DepthProModelOutput(BaseModelOutput): - """ - Base class for model's outputs, with potential fov, hidden states and attentions. + # first layer only uses the last hidden_state + fused_hidden_state = self.layers[0](hidden_states[0]) + # looping from the second layer to last layer + for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): + fused_hidden_state = layer(fused_hidden_state, hidden_state) - Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): - Field of View Scaler. 
- """ - fov: Optional[torch.FloatTensor] = None + return fused_hidden_state -@add_start_docstrings( - "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", - DEPTH_PRO_START_DOCSTRING, -) -class DepthProModel(DepthProPreTrainedModel): - def __init__(self, config, use_fov_model=None): - super().__init__(config) +class DepthProFOVModel(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() self.config = config - self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - - # dinov2 (vit) like encoder - self.encoder = DepthProEncoder(config) - # dpt (vit) like decoder - self.decoder = DepthProDecoder(config) - # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - embeddings = { - "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, - "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, - } - if self.use_fov: - embeddings['fov_embeddings'] = self.fov_model.embeddings.patch_embeddings - return embeddings + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.fov_model.encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder = DepthProViT(config) + self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.global_neck = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.ReLU(True) + ) + self.head = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + ) - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # TODO - # @add_code_sample_docstrings( - # checkpoint=_CHECKPOINT_FOR_DOC, - # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, - # config_class=_CONFIG_FOR_DOC, - # modality="vision", - # expected_output=_EXPECTED_OUTPUT_SHAPE, - # ) def forward( self, - pixel_values: torch.FloatTensor, - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + pixel_values: torch.Tensor, + global_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - encodings = self.encoder( + pixel_values = nn.functional.interpolate( pixel_values, - head_mask, + size=None, + scale_factor=0.25, + mode="bilinear", + align_corners=False, + ) + encoder_outputs = self.encoder( + pixel_values, + head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, + return_dict=return_dict, ) + last_hidden_state = encoder_outputs[0] - last_hidden_state = encodings.last_hidden_state - last_hidden_state, global_features = self.decoder(last_hidden_state) + last_hidden_state = self.encoder_neck(last_hidden_state) - if self.use_fov_model: - fov_encodings = self.fov_model( - pixel_values=pixel_values, - global_features=global_features.detach(), - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - fov = fov_encodings.last_hidden_state - attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None - else: - fov = None - attentions = encodings.attentions - hidden_states = encodings.hidden_states + last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token + last_hidden_state = last_hidden_state.permute(0, 2, 1) + + global_features = self.global_neck(global_features) + + ic(last_hidden_state.shape) + ic(global_features.shape) + + + last_hidden_state = last_hidden_state.reshape_as(global_features) + last_hidden_state = last_hidden_state + global_features + fov_output = self.head(last_hidden_state) + fov_output = fov_output.reshape(1) if not return_dict: - outputs = (last_hidden_state, fov, hidden_states, attentions) - outputs = (i for i in outputs if i is not None) - return outputs + head_outputs = (fov_output,) + return head_outputs + encoder_outputs[1:] - return DepthProModelOutput( - last_hidden_state=last_hidden_state, - fov=fov, - hidden_states=hidden_states, - attentions=attentions, + return BaseModelOutput( + last_hidden_state=fov_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, ) @@ -1422,7 +1303,6 @@ def __init__(self, config): nn.ReLU(), ) - def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: predicted_depth = self.head(hidden_states) predicted_depth = predicted_depth.squeeze(dim=1) @@ -1450,14 +1330,45 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): class DepthProForDepthEstimation(DepthProPreTrainedModel): def __init__(self, config, use_fov_model=None): super().__init__(config) + self.config = config self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - self.depth_pro = DepthProModel(config, use_fov_model=self.use_fov_model) + # dinov2 (vit) like encoders + self.depth_pro = DepthProModel(config) + + # project hidden states from encoder to match expected inputs in fusion stage + combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims + self.projections = nn.ModuleList() + for i, in_channels 
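(Aside on the FOV head: starting from the 48x48 global features, the strided convolutions shrink the map 48 -> 24 -> 12 -> 6 and the final 6x6 kernel collapses it to a single value, which is why the output can be reshaped to a scalar. A shape-only sketch assuming decoder_hidden_size=256; the ViT token branch that is added to the global features is omitted here.)

```python
import torch
from torch import nn

# Shape-only sketch of the FOV branch, assuming decoder_hidden_size=256 and
# 48x48 global features (the lowest-resolution projected encoder output).
decoder_hidden_size = 256
global_neck = nn.Sequential(
    nn.Conv2d(decoder_hidden_size, decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1),
    nn.ReLU(True),
)
head = nn.Sequential(
    nn.Conv2d(decoder_hidden_size // 2, decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1),
    nn.ReLU(True),
    nn.Conv2d(decoder_hidden_size // 4, decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1),
    nn.ReLU(True),
    nn.Conv2d(decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0),
)

global_features = torch.randn(1, decoder_hidden_size, 48, 48)
out = head(global_neck(global_features))    # 48 -> 24 -> 12 -> 6 -> 1
print(out.shape)                            # torch.Size([1, 1, 1, 1])
```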
in enumerate(combined_feature_dims): + if i == len(combined_feature_dims)-1 and in_channels == config.decoder_hidden_size: + # projection for last layer can be ignored if input and output channels already match + self.projections.append(nn.Identity()) + else: + self.projections.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + # dpt (vit) like fusion stage + self.num_decoder_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_decoder_layers) + + # depth estimation head self.head = DepthProDepthEstimationHead(config) + # dinov2 (vit) like encoder + self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None + # Initialize weights and apply final processing self.post_init() + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1476,6 +1387,7 @@ def forward( Returns: Examples: + TODO ```python >>> from transformers import AutoImageProcessor, DPTForDepthEstimation >>> import torch @@ -1526,21 +1438,39 @@ def forward( output_hidden_states=output_hidden_states, return_dict=True, ) - last_hidden_state = depth_pro_outputs[0] - ic(last_hidden_state.shape) - predicted_depth = self.head(last_hidden_state) - ic(predicted_depth.shape) + last_hidden_state = depth_pro_outputs.last_hidden_state + last_hidden_state = [proj(state) for proj, state in zip(self.projections, last_hidden_state)] + fused_state = self.fusion_stage(last_hidden_state) + predicted_depth = self.head(fused_state) + + if self.use_fov_model: + # use lowest scaled image features for fov model + global_features = last_hidden_state[0].detach() + fov_encodings = self.fov_model( + pixel_values=pixel_values, + global_features=global_features, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + fov = fov_encodings.last_hidden_state + attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + else: + fov = None + attentions = depth_pro_outputs.attentions + hidden_states = depth_pro_outputs.hidden_states if not return_dict: - if loss is None: - return (predicted_depth,) + depth_pro_outputs[1:] - else: - return (loss, predicted_depth) + depth_pro_outputs[1:] + outputs = (predicted_depth, fov, hidden_states, attentions) + outputs = (i for i in outputs if i is not None) + return outputs return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - fov=depth_pro_outputs.fov, - hidden_states=depth_pro_outputs.hidden_states, - attentions=depth_pro_outputs.attentions, + fov=fov, + hidden_states=hidden_states, + attentions=attentions, ) From 6be242ce30589132e71bd437fd6016827c3d8b6a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 13:51:45 +0500 Subject: [PATCH 15/72] make fov model compatible with custom config --- .../depth_pro/configuration_depth_pro.py | 2 + .../models/depth_pro/modeling_depth_pro.py | 267 ++++++++++-------- 2 files changed, 150 insertions(+), 119 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 
0558309004171f..8e197dbd0dab41 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -138,6 +138,7 @@ def __init__( scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_decoder=False, use_fov_model=False, + num_fov_head_layers=2, **kwargs, ): super().__init__(**kwargs) @@ -168,6 +169,7 @@ def __init__( self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids self.intermediate_feature_dims = intermediate_feature_dims self.scaled_images_ratios = scaled_images_ratios diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 9f146177402c00..0ddd503c4cc94a 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -610,6 +610,97 @@ def forward(self, features): projected = self.proj(features) return self.upsample_blocks(projected) + +def interpolate(pixel_values, scale_factor): + return nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=scale_factor, + mode="bilinear", + align_corners=False, + ) + +def patch(pixel_values, patch_size, overlap_ratio): + """Creates Patches from Batch.""" + B, C, W, H = pixel_values.shape + + if W == H == patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + + stride = int(patch_size * (1 - overlap_ratio)) + + # (B, C, W, H) + patches = torch.nn.functional.unfold( + pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) + ) + # patches.shape (B, patch_size**2 * C, num_patches) + patches = patches.permute(2, 0, 1) + # patches.shape (num_patches, B, patch_size**2 * C) + patches = patches.reshape(-1, C, patch_size, patch_size) + # patches.shape (B * num_patches, C, patch_size, patch_size) + + return patches + +def reshape_feature(hidden_states, width, height): + """Discard class token and reshape 1D feature map to a 2D grid.""" + B, _, C = hidden_states.shape + # (B, WH+1, C) + hidden_states = hidden_states[:, 1:, :] # remove class token + # (B, WH, C) + hidden_states = hidden_states.reshape(B, width, height, C) + # (B, W, H, C) + hidden_states = hidden_states.permute(0, 3, 1, 2) + # (B, C, W, H) + return hidden_states + +def merge(patches, batch_size, merge_out_size): + """Recreates Batch from Patches.""" + num_patches, num_channels, out_size, out_size = patches.shape + + if num_patches == batch_size: + # merge only if the patches were created from scaled image + # patches are not created when scaled image size is equal to patch size + return patches + + box_size = int(math.sqrt(num_patches // batch_size)) + """ + merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) + padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) + """ + padding = ( box_size * out_size - merge_out_size ) // ( 2 * box_size - 2 ) + + i = 0 + boxes = [] + for h in range(box_size): + boxes_in_row = [] + for w in range(box_size): + box = patches[batch_size * i : batch_size * (i + 1)] + + if h != 0: + # remove pad from height if box is not at top border + box = box[..., padding:, :] + if w != 0: + # remove pad from width if box is not at left border + box = box[..., :, padding:] + if h != box_size - 1: + # remove pad 
from height if box is not at bottom border + box = box[..., :box.shape[-2]-padding, :] + if w != box_size - 1: + # remove pad from width if box is not at right border + box = box[..., :, :box.shape[-1]-padding] + + boxes_in_row.append(box) + i += 1 + + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + + boxes = torch.cat(boxes, dim=-2) + boxes = boxes[..., :merge_out_size, :merge_out_size] + return boxes + + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -700,96 +791,6 @@ def __init__(self, config: DepthProConfig) -> None: bias=True, ) - def _interpolate(self, pixel_values, scale_factor): - if scale_factor == 1: - return pixel_values - return nn.functional.interpolate( - pixel_values, - size=None, - scale_factor=scale_factor, - mode="bilinear", - align_corners=False, - ) - - def _patch(self, pixel_values, overlap_ratio): - if pixel_values.shape[-1] == self.config.patch_size: - # create patches only if scaled image is not already equal to patch size - return pixel_values - - patch_size = self.config.patch_size - stride = int(patch_size * (1 - overlap_ratio)) - - # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) - patches = torch.nn.functional.unfold( - pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) - ) - # patches.shape (B, -1, num_patches) - patches = patches.permute(2, 0, 1) - # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, self.config.num_channels, patch_size, patch_size) - # patches.shape (B * num_patches, config.num_channels, config.patch_size, config.patch_size) - - return patches - - def _reshape_feature( - self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 - ): - """Discard class token and reshape 1D feature map to a 2D grid.""" - b, hw, c = hidden_states.shape - - # Remove class token. 
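The `patch` and `merge` helpers introduced above carry the core geometry of the multi-scale encoder: `patch` slices each scaled image into overlapping tiles with `unfold`, using a stride derived from the overlap ratio, and `merge` re-tiles the per-patch feature maps while trimming `padding` pixels from interior borders so the seams line up. A minimal sanity check of that geometry, with illustrative sizes only (a 768x768 scaled image, patch_size=384, overlap_ratio=0.5, out_size=24; these numbers are assumptions for the example, not values asserted by this patch):

```python
import math

import torch

# Illustrative values only; not taken from the committed configuration.
batch_size, num_channels = 1, 3
patch_size, overlap_ratio = 384, 0.5
pixel_values = torch.randn(batch_size, num_channels, 768, 768)  # e.g. a 0.5-scaled 1536 px image

# patch(): overlapping tiles via unfold, stride derived from the overlap ratio
stride = int(patch_size * (1 - overlap_ratio))  # 192
patches = torch.nn.functional.unfold(pixel_values, kernel_size=patch_size, stride=stride)
patches = patches.permute(2, 0, 1).reshape(-1, num_channels, patch_size, patch_size)
print(patches.shape)  # torch.Size([9, 3, 384, 384]) -> a 3x3 grid of tiles

# merge(): trim `padding` pixels from interior borders so the 3x3 grid of
# out_size x out_size feature maps re-tiles into a single merge_out_size map
out_size, merge_out_size = 24, 48
box_size = int(math.sqrt(patches.shape[0] // batch_size))  # 3
padding = (box_size * out_size - merge_out_size) // (2 * box_size - 2)  # 6
assert (box_size - 2) * (out_size - 2 * padding) + 2 * (out_size - padding) == merge_out_size
```

The final assertion mirrors the formula quoted in the `merge` docstring above.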
- if cls_token_offset > 0: - hidden_states = hidden_states[:, cls_token_offset:, :] - - # Shape: (batch, height, width, dim) -> (batch, dim, height, width) - hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) - return hidden_states - - def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: - if batch_size == x.shape[0]: - # merge only if the patches were created from this scaled image - # pathces are not created when scaled image size is equal to patch size - return x - - # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) - box_size = int(math.sqrt(x.shape[0] // batch_size)) - - """ - merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) - padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) - """ - padding = ( box_size * self.out_size - merge_out_size ) // ( 2 * box_size - 2 ) - - i = 0 - boxes = [] - for h in range(box_size): - boxes_in_row = [] - for w in range(box_size): - box = x[batch_size * i : batch_size * (i + 1)] - - if h != 0: - # remove pad from height if box is not at top border - box = box[..., padding:, :] - if w != 0: - # remove pad from width if box is not at left border - box = box[..., :, padding:] - if h != box_size - 1: - # remove pad from height if box is not at bottom border - box = box[..., :box.shape[-2]-padding, :] - if w != box_size - 1: - # remove pad from width if box is not at right border - box = box[..., :, :box.shape[-1]-padding] - - boxes_in_row.append(box) - i += 1 - - boxes_in_row = torch.cat(boxes_in_row, dim=-1) - boxes.append(boxes_in_row) - - boxes = torch.cat(boxes, dim=-2) - boxes = boxes[..., :merge_out_size, :merge_out_size] - return boxes - def forward( self, pixel_values: torch.Tensor, @@ -825,14 +826,15 @@ def forward( scaled_images = [] for ratio in self.scaled_images_ratios: - scaled_images.append(self._interpolate(pixel_values, ratio)) + scaled_images.append(interpolate(pixel_values, ratio)) # (B, config.num_channels, config.image_size * ratio, config.image_size * ratio) # STEP 2: create patches for i in range(self.n_scaled_images): - scaled_images[i] = self._patch( + scaled_images[i] = patch( scaled_images[i], + patch_size=self.config.patch_size, overlap_ratio=self.scaled_images_overlap_ratios[i], ) scaled_images_num_patches = [len(i) for i in scaled_images] @@ -870,12 +872,12 @@ def forward( # (scaled_images_num_patches[i], self.seq_len+1, config.hidden_size) # b. reshape back to image like - features = self._reshape_feature( + features = reshape_feature( hidden_state, self.out_size, self.out_size ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) # c. merge patches back together - features = self._merge( + features = merge( features, batch_size=B, merge_out_size=self.out_size*2**i ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) @@ -897,14 +899,14 @@ def forward( # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like - features = self._reshape_feature( + features = reshape_feature( hidden_state, self.out_size, self.out_size, ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together - features = self._merge( + features = merge( features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) @@ -920,12 +922,12 @@ def forward( hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like - image_features = self._reshape_feature( + image_features = reshape_feature( hidden_state, self.out_size, self.out_size ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = self._merge( + image_features = merge( image_features, batch_size=B, merge_out_size=self.out_size*2**(0), ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) @@ -1206,18 +1208,39 @@ def __init__(self, config: DepthProConfig) -> None: self.hidden_size = config.hidden_size self.decoder_hidden_size = config.decoder_hidden_size + self.out_size = config.patch_size // config.patch_embeddings_size + self.encoder = DepthProViT(config) self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) self.global_neck = nn.Sequential( nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) - self.head = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + + if config.decoder_hidden_size // 2**config.num_fov_head_layers == 0: + raise ValueError( + f"decoder_hidden_size={config.decoder_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " + "i.e config.decoder_hidden_size // 2**config.num_fov_head_layers > 0" + ) + + # create initial head layers + self.head = nn.Sequential() + for i in range(config.num_fov_head_layers): + self.head.append( + nn.Conv2d(self.decoder_hidden_size // 2**(i+1), self.decoder_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + ) + self.head.append(nn.ReLU(True)) + # calculate expected shapes to finally generate a scalar output from final head layer + final_in_channels = self.decoder_hidden_size // 2**(config.num_fov_head_layers+1) + final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) + self.head.append( + nn.Conv2d( + in_channels=final_in_channels, + out_channels=1, + kernel_size=final_kernal_size, + stride=1, + padding=0 + ) ) def forward( @@ -1235,34 +1258,40 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pixel_values = nn.functional.interpolate( + B, C, W, H = pixel_values.shape + + # follow the steps same as with image features in DepthProEncoder + pixel_values = interpolate( pixel_values, - size=None, - scale_factor=0.25, - mode="bilinear", - align_corners=False, + scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image ) - encoder_outputs = self.encoder( + patches = patch( pixel_values, + patch_size=self.config.patch_size, + overlap_ratio=self.config.scaled_images_overlap_ratios[0], + ) + encoder_outputs = self.encoder( + patches, head_mask=head_mask, 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] - last_hidden_state = self.encoder_neck(last_hidden_state) - - last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token - last_hidden_state = last_hidden_state.permute(0, 2, 1) + last_hidden_state = reshape_feature( + last_hidden_state, + width=self.out_size, + height=self.out_size + ) + last_hidden_state = merge( + last_hidden_state, + batch_size=B, + merge_out_size=self.out_size, + ) global_features = self.global_neck(global_features) - ic(last_hidden_state.shape) - ic(global_features.shape) - - - last_hidden_state = last_hidden_state.reshape_as(global_features) last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(1) From 01891085f0961ea28049616abed63a8bd9cb2f05 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 13:54:43 +0500 Subject: [PATCH 16/72] replace word "decoder" with "fusion" --- .../depth_pro/configuration_depth_pro.py | 10 ++--- .../models/depth_pro/modeling_depth_pro.py | 44 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 8e197dbd0dab41..f124d3e5b71ab7 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -59,7 +59,7 @@ class DepthProConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): - TODO: image_size / 2**n_decoder_blocks = patch_size / patch_embeddings_size + TODO: image_size / 2**n_fusion_blocks = patch_size / patch_embeddings_size The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. 
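The field-of-view head rebuilt in the previous hunks halves its channel width with every strided convolution, which is why the patch validates `decoder_hidden_size // 2**num_fov_head_layers > 0` (renamed to `fusion_hidden_size` just below), and it sizes the final kernel so that the remaining feature map collapses to a single value per image. A quick shape check under assumed values (hidden size 256, two head layers, out_size 24; example numbers, not the shipped defaults):

```python
import torch
from torch import nn

# Example numbers only; not the shipped defaults.
fusion_hidden_size, num_fov_head_layers, out_size = 256, 2, 24

layers = []
for i in range(num_fov_head_layers):
    layers.append(
        nn.Conv2d(
            fusion_hidden_size // 2 ** (i + 1),
            fusion_hidden_size // 2 ** (i + 2),
            kernel_size=3,
            stride=2,
            padding=1,
        )
    )
    layers.append(nn.ReLU(True))
final_kernel_size = int((out_size - 1) / 2 ** num_fov_head_layers + 1)  # 6 for these values
layers.append(
    nn.Conv2d(fusion_hidden_size // 2 ** (num_fov_head_layers + 1), 1, kernel_size=final_kernel_size)
)
head = nn.Sequential(*layers)

features = torch.randn(2, fusion_hidden_size // 2, out_size, out_size)  # summed neck outputs
print(head(features).shape)  # torch.Size([2, 1, 1, 1]) -> one field-of-view scalar per image
```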
@@ -110,7 +110,7 @@ class DepthProConfig(PretrainedConfig): def __init__( self, hidden_size=1024, - decoder_hidden_size=256, + fusion_hidden_size=256, num_hidden_layers=24, num_attention_heads=16, mlp_ratio=4, @@ -136,7 +136,7 @@ def __init__( scaled_images_ratios = [0.25, 0.5, 1], scaled_images_overlap_ratios = [0.0, 0.5, 0.25], scaled_images_feature_dims = [1024, 1024, 512], - use_batch_norm_in_decoder=False, + use_batch_norm_in_fusion=False, use_fov_model=False, num_fov_head_layers=2, **kwargs, @@ -144,7 +144,7 @@ def __init__( super().__init__(**kwargs) self.hidden_size = hidden_size - self.decoder_hidden_size = decoder_hidden_size + self.fusion_hidden_size = fusion_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.mlp_ratio = mlp_ratio @@ -167,7 +167,7 @@ def __init__( ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.use_batch_norm_in_decoder = use_batch_norm_in_decoder + self.use_batch_norm_in_fusion = use_batch_norm_in_fusion self.use_fov_model = use_fov_model self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 0ddd503c4cc94a..0ac35b582d7fca 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -706,7 +706,7 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.fusion_hidden_size = config.fusion_hidden_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -762,7 +762,7 @@ def __init__(self, config: DepthProConfig) -> None: # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() for i, feature_dims in enumerate(self.intermediate_feature_dims): - intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims + intermediate_dims = self.fusion_hidden_size if i == 0 else feature_dims upsample_block = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=intermediate_dims, @@ -939,7 +939,7 @@ def forward( scaled_images_features[0] = torch.cat((scaled_images_features[0], image_features), dim=1) scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) - # STEP 8: return these features in order of increasing size as what decoder expects + # STEP 8: return these features in order of increasing size as what fusion expects last_hidden_state = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) *scaled_images_features, @@ -1094,8 +1094,8 @@ class DepthProResidualLayer(nn.Module): def __init__(self, config): super().__init__() - self.use_batch_norm = config.use_batch_norm_in_decoder - self.hidden_size = config.decoder_hidden_size + self.use_batch_norm = config.use_batch_norm_in_fusion + self.hidden_size = config.fusion_hidden_size self.activation1 = nn.ReLU() self.convolution1 = nn.Conv2d( @@ -1151,15 +1151,15 @@ def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: if self.use_deconv: self.deconv = nn.ConvTranspose2d( - in_channels=config.decoder_hidden_size, - out_channels=config.decoder_hidden_size, + in_channels=config.fusion_hidden_size, + 
out_channels=config.fusion_hidden_size, kernel_size=2, stride=2, padding=0, bias=False, ) - self.projection = nn.Conv2d(config.decoder_hidden_size, config.decoder_hidden_size, kernel_size=1, bias=True) + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) self.skip_add = nn.quantized.FloatFunctional() def forward(self, hidden_state, residual=None): @@ -1206,32 +1206,32 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.fusion_hidden_size = config.fusion_hidden_size self.out_size = config.patch_size // config.patch_embeddings_size self.encoder = DepthProViT(config) - self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.encoder_neck = nn.Linear(self.hidden_size, self.fusion_hidden_size // 2) self.global_neck = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) - if config.decoder_hidden_size // 2**config.num_fov_head_layers == 0: + if config.fusion_hidden_size // 2**config.num_fov_head_layers == 0: raise ValueError( - f"decoder_hidden_size={config.decoder_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " - "i.e config.decoder_hidden_size // 2**config.num_fov_head_layers > 0" + f"fusion_hidden_size={config.fusion_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " + "i.e config.fusion_hidden_size // 2**config.num_fov_head_layers > 0" ) # create initial head layers self.head = nn.Sequential() for i in range(config.num_fov_head_layers): self.head.append( - nn.Conv2d(self.decoder_hidden_size // 2**(i+1), self.decoder_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + nn.Conv2d(self.fusion_hidden_size // 2**(i+1), self.fusion_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.decoder_hidden_size // 2**(config.num_fov_head_layers+1) + final_in_channels = self.fusion_hidden_size // 2**(config.num_fov_head_layers+1) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( @@ -1311,7 +1311,7 @@ class DepthProDepthEstimationHead(nn.Module): """ The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. This module comprises a sequence of convolutional and transposed convolutional layers - that process the feature map from the decoder to produce a single-channel depth map. + that process the feature map from the fusion to produce a single-channel depth map. Key operations include dimensionality reduction and upsampling to match the input resolution. 
""" @@ -1319,7 +1319,7 @@ def __init__(self, config): super().__init__() self.config = config - features = config.decoder_hidden_size + features = config.fusion_hidden_size self.head = nn.Sequential( nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( @@ -1369,14 +1369,14 @@ def __init__(self, config, use_fov_model=None): combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims self.projections = nn.ModuleList() for i, in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims)-1 and in_channels == config.decoder_hidden_size: + if i == len(combined_feature_dims)-1 and in_channels == config.fusion_hidden_size: # projection for last layer can be ignored if input and output channels already match self.projections.append(nn.Identity()) else: self.projections.append( nn.Conv2d( in_channels=in_channels, - out_channels=config.decoder_hidden_size, + out_channels=config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, @@ -1385,8 +1385,8 @@ def __init__(self, config, use_fov_model=None): ) # dpt (vit) like fusion stage - self.num_decoder_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) - self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_decoder_layers) + self.num_fusion_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_fusion_layers) # depth estimation head self.head = DepthProDepthEstimationHead(config) From 7614e1a709c14c8f9e32730fe240e401ae023ec3 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 24 Nov 2024 13:57:36 +0500 Subject: [PATCH 17/72] weight conversion script --- .../depth_pro/convert_depth_pro_to_hf.py | 344 ++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 src/transformers/models/depth_pro/convert_depth_pro_to_hf.py diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py new file mode 100644 index 00000000000000..38b7a7853d76d6 --- /dev/null +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DepthPro checkpoints from the original repository. 
+ +URL: https://huggingface.co/apple/DepthPro/tree/main +""" + +import argparse +import json +from pathlib import Path +import re + +import requests +import torch +import torch.nn as nn +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model +from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.utils import logging + +# TODO: import directly from transformers +from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig +from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def create_vit_rename_keys(config): + rename_keys = [] + # fmt: off + + # patch embedding layer + rename_keys.append(("cls_token", "embeddings.cls_token")) + rename_keys.append(("pos_embed", "embeddings.position_embeddings")) + rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) + + for i in range(config.num_hidden_layers): + # layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) + # MLP + if config.use_swiglu_ffn: + rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) + rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) + rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) + rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) + else: + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) + # layerscale + rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) + rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) + # attention projection layer + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) + + # final layernorm + rename_keys.append(("norm.weight", "layernorm.weight")) + rename_keys.append(("norm.bias", "layernorm.bias")) + + # fmt: on + return rename_keys + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + state_dict_keys = state_dict.keys() + for key in list(state_dict_keys): + if "qkv" in key: + in_proj = state_dict.pop(key) + q, k, v = torch.split(in_proj, config.hidden_size, dim=0) + + if "fov" in key: + key = key.replace('fov.encoder.0', 'fov_model.encoder') + else: + key = "depth_pro." 
+ key + + key = key.replace("blocks", "encoder.layer") + state_dict[key.replace("attn.qkv", "attention.attention.query")] = q + state_dict[key.replace("attn.qkv", "attention.attention.key")] = k + state_dict[key.replace("attn.qkv", "attention.attention.value")] = v + return state_dict + +# hard coded upsample keys +def update_hard_coded_keys(state_dict): + mapping = [ + # upsamples + ('encoder.upsample_latent0.0.weight', 'depth_pro.encoder.upsample_intermediate.1.proj.weight'), + ('encoder.upsample_latent0.1.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight'), + ('encoder.upsample_latent0.2.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight'), + ('encoder.upsample_latent0.3.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight'), + ('encoder.upsample_latent1.0.weight', 'depth_pro.encoder.upsample_intermediate.0.proj.weight'), + ('encoder.upsample_latent1.1.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight'), + ('encoder.upsample_latent1.2.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight'), + ('encoder.upsample0.0.weight', 'depth_pro.encoder.upsample_scaled_images.2.proj.weight'), + ('encoder.upsample0.1.weight', 'depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight'), + ('encoder.upsample1.0.weight', 'depth_pro.encoder.upsample_scaled_images.1.proj.weight'), + ('encoder.upsample1.1.weight', 'depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight'), + ('encoder.upsample2.0.weight', 'depth_pro.encoder.upsample_scaled_images.0.proj.weight'), + ('encoder.upsample2.1.weight', 'depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight'), + ('encoder.upsample_lowres.weight', 'depth_pro.encoder.upsample_image.upsample_blocks.0.weight'), + ('encoder.upsample_lowres.bias', 'depth_pro.encoder.upsample_image.upsample_blocks.0.bias'), + + # neck + ("fov.downsample.0.weight", "fov_model.global_neck.0.weight"), + ("fov.downsample.0.bias", "fov_model.global_neck.0.bias"), + ("fov.encoder.1.weight", "fov_model.encoder_neck.weight"), + ("fov.encoder.1.bias", "fov_model.encoder_neck.bias"), + ] + for src, dest in mapping: + state_dict[dest] = state_dict.pop(src) + + return state_dict + + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + + + +@torch.no_grad() +def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our DepthPro structure. + """ + + # define default DepthPro configuration + config = DepthProConfig() + + # load original weights from huggingface hub + # TODO: download from hub + # file_path = hf_hub_download(repo_id, filename) + file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" + state_dict = torch.load(file_path, weights_only=True) + + # enumerate fusion layers + n_scaled_images = len(config.scaled_images_ratios) # 3 + n_intermediate_hooks = len(config.intermediate_hook_ids) # 2 + n_fusion_layers = n_scaled_images + n_intermediate_hooks # 5 + + # 1. 
keys for vit encoders + vit_rename_keys = create_vit_rename_keys(config) + for src_prefix, dest_prefix in [ + ("encoder.patch_encoder", "depth_pro.encoder.patch_encoder"), + ("encoder.image_encoder", "depth_pro.encoder.image_encoder"), + ("fov.encoder.0", "fov_model.encoder"), + ]: + for src, dest in vit_rename_keys: + src = src_prefix + "." + src + dest = dest_prefix + "." + dest + state_dict[dest] = state_dict.pop(src) + + # 2. qkv keys for vit encoders + state_dict = read_in_q_k_v(state_dict, config) + + # 3. hard coded mapping + state_dict = update_hard_coded_keys(state_dict) + + + for key in list(state_dict.keys()): + + # 4. final depth estimation head + if key.startswith("head."): + new_key = "head." + key + + # 5. fov model head + elif key.startswith("fov.head."): + new_key = key.replace("fov", 'fov_model') + + # 6. projections between encoder and fusion + elif "decoder.convs." in key: + n = re.findall(r'\d+', key)[0] # find digit inside string + n = n_fusion_layers - int(n) - 1 + new_key = f"projections.{n}.weight" + + # 7. fuse low res with image features + elif "encoder.fuse_lowres." in key: + new_key = key.replace("encoder.fuse_lowres", "depth_pro.encoder.fuse_image_with_low_res") + + # 8. fusion stage (decoder) + elif key.startswith("decoder.fusions."): + new_key = key.replace("decoder.fusions.", "fusion_stage.layers.") + new_key = new_key.replace("resnet1", "residual_layer1") + new_key = new_key.replace("resnet2", "residual_layer2") + new_key = new_key.replace("residual.1", "convolution1") + new_key = new_key.replace("residual.3", "convolution2") + new_key = new_key.replace("out_conv", "projection") + + n_with_dots = re.findall(r'.\d+.', new_key)[0] # find digit inside string followed by . + n = n_with_dots[1:-1] + n = n_fusion_layers - int(n) - 1 + new_key = new_key.replace(n_with_dots, f".{n}.") + + else: + continue + + state_dict[new_key] = state_dict.pop(key) + + model = DepthProForDepthEstimation(config, use_fov_model=True).eval() + model.load_state_dict(state_dict) + + exit() + + # ---------------- + + + + for key, val in state_dict.copy().items(): + val = state_dict.pop(key) + if "w12" in key: + key = key.replace("w12", "weights_in") + if "w3" in key: + key = key.replace("w3", "weights_out") + state_dict[key] = val + + # load HuggingFace model + if image_classifier: + model = Dinov2ForImageClassification(config).eval() + model.dinov2.load_state_dict(state_dict) + model_name_to_classifier_dict_url = { + "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", + "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", + "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", + "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", + } + url = model_name_to_classifier_dict_url[model_name] + classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) + model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) + else: + model = Dinov2Model(config).eval() + model.load_state_dict(state_dict) + + # load image + image = prepare_img() + + # preprocess image + transformations = transforms.Compose( + [ + transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + 
mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values + std=IMAGENET_DEFAULT_STD, # across a large photo dataset. + ), + ] + ) + + original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + + processor = BitImageProcessor( + size={"shortest_edge": 256}, + resample=PILImageResampling.BICUBIC, + image_mean=IMAGENET_DEFAULT_MEAN, + image_std=IMAGENET_DEFAULT_STD, + ) + pixel_values = processor(image, return_tensors="pt").pixel_values + + assert torch.allclose(original_pixel_values, pixel_values) + + with torch.no_grad(): + outputs = model(pixel_values, output_hidden_states=True) + original_outputs = original_model(pixel_values) + + # assert values + if image_classifier: + print("Predicted class:") + class_idx = outputs.logits.argmax(-1).item() + print(model.config.id2label[class_idx]) + else: + assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape + assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + model_name_to_hf_name = { + "dinov2_vits14": "dinov2-small", + "dinov2_vitb14": "dinov2-base", + "dinov2_vitl14": "dinov2-large", + "dinov2_vitg14": "dinov2-giant", + "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", + "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", + "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", + "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", + } + + name = model_name_to_hf_name[model_name] + model.push_to_hub(f"facebook/{name}") + processor.push_to_hub(f"facebook/{name}") + + +convert_depth_pro_checkpoint("apple/DepthPro", "depth_pro.pt", "yooo_torch_dump", False) +exit() +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." + ) + parser.add_argument( + "--filename", default="depth_pro.pt", type=str, help="Name of the file from repo you'd like to convert." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_depth_pro_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) From 7d323ce91f071cc5ed6b0c36f407866e545dbe65 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:41:13 +0500 Subject: [PATCH 18/72] fix fov squeeze --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 0ac35b582d7fca..eb8bf02f83d160 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1294,7 +1294,7 @@ def forward( last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) - fov_output = fov_output.reshape(1) + fov_output = fov_output.reshape(B) if not return_dict: head_outputs = (fov_output,) From 6aaa59e943c5d5fd5c301404aaa47e8db1402355 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:42:18 +0500 Subject: [PATCH 19/72] update conversion script (without test) --- .../depth_pro/convert_depth_pro_to_hf.py | 160 +++++++----------- 1 file changed, 59 insertions(+), 101 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py index 38b7a7853d76d6..de7bf395a35552 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -83,6 +83,7 @@ def create_vit_rename_keys(config): # fmt: on return rename_keys + # we split up the matrix of each encoder layer into queries, keys and values def read_in_q_k_v(state_dict, config): state_dict_keys = state_dict.keys() @@ -102,6 +103,7 @@ def read_in_q_k_v(state_dict, config): state_dict[key.replace("attn.qkv", "attention.attention.value")] = v return state_dict + # hard coded upsample keys def update_hard_coded_keys(state_dict): mapping = [ @@ -134,13 +136,24 @@ def update_hard_coded_keys(state_dict): return state_dict - # We will verify our results on an image of cute cats -def prepare_img(): +def inference_test(processor, model): url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image + inputs = processor(image) + with torch.no_grad(): + outputs = model(**inputs) + + predicted_depth = outputs.predicted_depth + fov = outputs.fov + + predicted_depth, fov = processor.post_process_depth_estimation(predicted_depth, fov) + + print("predicted_depth.shape:", predicted_depth.shape) + print("fov.shape:", fov.shape) + print("fov:", fov) + print("Inference was Successfull!") @torch.no_grad() @@ -150,12 +163,10 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu """ # define default DepthPro configuration - config = DepthProConfig() + config = DepthProConfig(use_fov_model=True) # load original weights from huggingface hub - # TODO: download from hub - # file_path = hf_hub_download(repo_id, filename) - file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" + file_path = hf_hub_download(repo_id, filename) state_dict = torch.load(file_path, weights_only=True) # enumerate fusion layers @@ -224,108 +235,50 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu model = DepthProForDepthEstimation(config, use_fov_model=True).eval() model.load_state_dict(state_dict) - exit() - - # 
---------------- + # TODO + processor = ... + # inference_test(processor, model) - + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + # TODO + # print(f"Saving image processor to {pytorch_dump_folder_path}") + # processor.save_pretrained(pytorch_dump_folder_path) - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2ForImageClassification(config).eval() - model.dinov2.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", - "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", - "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", - "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2Model(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + # TODO + # if push_to_hub: + # model.push_to_hub("...") + # processor.push_to_hub("...") - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - assert torch.allclose(original_pixel_values, pixel_values) +""" +- create files locally using function +```py +convert_depth_pro_checkpoint( + "apple/DepthPro", + "depth_pro.pt", + "my_local_dump", + False, +) +``` + +- create files locally using command line args +```cmd +python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ + --repo_id "apple/DepthPro" \ + --filename "depth_pro.pt" \ + --pytorch_dump_folder_path "my_local_dump" \ + --push_to_hub 0 +``` +""" - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14": "dinov2-small", - "dinov2_vitb14": "dinov2-base", - "dinov2_vitl14": "dinov2-large", - "dinov2_vitg14": "dinov2-giant", - "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", - "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", - "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", - "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"facebook/{name}") - processor.push_to_hub(f"facebook/{name}") - - -convert_depth_pro_checkpoint("apple/DepthPro", "depth_pro.pt", "yooo_torch_dump", False) -exit() if __name__ == "__main__": parser = argparse.ArgumentParser() + # Required parameters parser.add_argument( "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." 
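The `read_in_q_k_v` helper earlier in this script relies on the original checkpoint storing each attention block's query, key and value as one fused `qkv` projection stacked along the output dimension, so a plain `torch.split` along dim 0 recovers the three HuggingFace weights. A minimal illustration with an arbitrary hidden size (the value is only for the example):

```python
import torch

hidden_size = 8  # arbitrary example value
fused_qkv_weight = torch.randn(3 * hidden_size, hidden_size)  # rows stacked as [q; k; v]

q, k, v = torch.split(fused_qkv_weight, hidden_size, dim=0)
assert q.shape == k.shape == v.shape == (hidden_size, hidden_size)
# each chunk then lands under e.g. "...attention.attention.query.weight" in the HF state dict
```

The same split would apply to a fused bias of shape `(3 * hidden_size,)`, since `torch.split` operates along dim 0 there as well.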
@@ -341,4 +294,9 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu ) args = parser.parse_args() - convert_depth_pro_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) + convert_depth_pro_checkpoint( + args.repo_id, + args.filename, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) From 263b773db7ac897a6a610e15a3fc5be0b79615da Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:47:17 +0500 Subject: [PATCH 20/72] upload ruff image processing --- .../depth_pro/image_processing_depth_pro.py | 397 ++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100644 src/transformers/models/depth_pro/image_processing_depth_pro.py diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py new file mode 100644 index 00000000000000..883c50ebfe6fbd --- /dev/null +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -0,0 +1,397 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for DepthPro.""" + +from typing import Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np +from icecream import ic + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, logging + +import math +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import pad, resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_torch_available, + is_torch_tensor, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_vision_available, + logging, + requires_backends, +) + +from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimatorOutput + + +if is_torch_available(): + import torch + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class DepthProImageProcessor(BaseImageProcessor): + r""" + Constructs a DepthPro image processor. 
+ + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. 
+ data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + + # ic(image.dtype) + # ic(type(image)) + # ic(image.shape) + # ic(image.mean()) + # ic(image.std()) + # ic(image.min()) + # ic(image.max()) + # ic(output_size) + # ic(resample) + # ic(data_format) + # ic(input_data_format) + # # exit() + + # return torch.nn.functional.interpolate( + # input=torch.from_numpy(image), + # size=output_size, + # mode=resample, + # align_corners=True, + # ) + + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. 
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + # TODO + # depth-pro image preprocessing scales the image before resizing it + + if do_resize: + images = [ + self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + predicted_depth, + fov=None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + self.size = { + 'width': 3024, + 'height': 2268, + } + W = self.size['width'] + H = self.size['height'] + + if (fov is not None) and (len(predicted_depth) != len(fov)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + output_depths = [] + output_fovs = None if fov is None else [] + fov = [None] * len(predicted_depth) if fov is None else fov + for depth, fov_value in zip(predicted_depth, fov): + + if fov_value is not None: + fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * W / fov_value + + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False + ).squeeze() + + if fov_value is not None: + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + output_fovs.append(fov_value) + + output_depths.append(depth) + + return output_depths, output_fovs From 17e5487ce6782998aaccb8a8799b9495d7d545bd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 09:35:52 +0500 Subject: [PATCH 21/72] create fast image processing --- .../image_processing_depth_pro_fast.py | 362 ++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 src/transformers/models/depth_pro/image_processing_depth_pro_fast.py diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py new file mode 100644 index 00000000000000..8860f2e86830c0 --- /dev/null +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -0,0 +1,362 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for DepthPro.""" + +import functools +from typing import Dict, List, Optional, Union + +from ...image_processing_base import BatchFeature +from ...image_processing_utils import get_size_dict +from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + ImageType, + PILImageResampling, + get_image_type, + make_list_of_images, + pil_torch_interpolation_mapping, +) +from ...utils import TensorType, logging, requires_backends +from ...utils.import_utils import is_torch_available, is_torchvision_available + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +if is_torchvision_available(): + from torchvision.transforms import Compose, Normalize, PILToTensor, Resize + + +class DepthProImageProcessorFast(BaseImageProcessorFast): + r""" + Constructs a DepthPro image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
+ """ + + model_input_names = ["pixel_values"] + _transform_params = [ + "do_resize", + "do_rescale", + "do_normalize", + "size", + "resample", + "antialias", + "rescale_factor", + "image_mean", + "image_std", + "image_type", + ] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.antialias = antialias + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def _build_transforms( + self, + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + image_type: ImageType, + ) -> "Compose": + """ + Given the input settings build the image transforms using `torchvision.transforms.Compose`. + """ + transforms = [] + + # All PIL and numpy values need to be converted to a torch tensor + # to keep cross compatibility with slow image processors + if image_type == ImageType.PIL: + transforms.append(PILToTensor()) + + elif image_type == ImageType.NUMPY: + transforms.append(NumpyToTensor()) + + # We can combine rescale and normalize into a single operation for speed + if do_rescale and do_normalize: + transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor)) + elif do_rescale: + transforms.append(Rescale(rescale_factor=rescale_factor)) + elif do_normalize: + transforms.append(Normalize(image_mean, image_std)) + + # depth-pro scales the image before resizing it + if do_resize: + transforms.append( + Resize( + (size["height"], size["width"]), + interpolation=pil_torch_interpolation_mapping[resample], + antialias=antialias + ) + ) + + return Compose(transforms) + + @functools.lru_cache(maxsize=1) + def _validate_input_arguments( + self, + return_tensors: Union[str, TensorType], + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + data_format: Union[str, ChannelDimension], + image_type: ImageType, + ): + if return_tensors != "pt": + raise ValueError("Only returning PyTorch tensors is currently supported.") + + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be 
specified if do_normalize is True.") + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + antialias: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Only "pt" is supported + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. The following formats are currently supported: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
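A minimal usage sketch for the fast processor defined in this patch, assuming the patch is installed and that both torch and torchvision are available; the randomly generated PIL image below is only a stand-in for a real photo:

```py
# Illustrative only: exercises the preprocess() defaults added in this patch
# (resize to 1536x1536, rescale by 1/255, normalize with mean/std 0.5).
import numpy as np
import torch
from PIL import Image

from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast

image = Image.fromarray(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))

processor = DepthProImageProcessorFast()
inputs = processor.preprocess(image, return_tensors="pt")

print(inputs["pixel_values"].shape)  # expected: torch.Size([1, 3, 1536, 1536])
print(inputs["pixel_values"].dtype)  # typically torch.float32 after rescale + normalize
```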
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + size = size if size is not None else self.size + # Make hashable for cache + size = SizeDict(**size) + image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean + image_std = tuple(image_std) if isinstance(image_std, list) else image_std + + images = make_list_of_images(images) + image_type = get_image_type(images[0]) + + if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]: + raise ValueError(f"Unsupported input image type {image_type}") + + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + antialias=antialias, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + return_tensors=return_tensors, + data_format=data_format, + image_type=image_type, + ) + + transforms = self.get_transforms( + do_resize=do_resize, + do_rescale=do_rescale, + do_normalize=do_normalize, + size=size, + resample=resample, + antialias=antialias, + rescale_factor=rescale_factor, + image_mean=image_mean, + image_std=image_std, + image_type=image_type, + ) + transformed_images = [transforms(image) for image in images] + + data = {"pixel_values": torch.stack(transformed_images, dim=0)} + return BatchFeature(data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + predicted_depth, + fov=None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. 
+ """ + requires_backends(self, "torch") + + self.size = { + 'width': 3024, + 'height': 2268, + } + W = self.size['width'] + H = self.size['height'] + + if (fov is not None) and (len(predicted_depth) != len(fov)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + output_depths = [] + output_fovs = None if fov is None else [] + fov = [None] * len(predicted_depth) if fov is None else fov + for depth, fov_value in zip(predicted_depth, fov): + + if fov_value is not None: + fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * W / fov_value + + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False + ).squeeze() + + if fov_value is not None: + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + output_fovs.append(fov_value) + + output_depths.append(depth) + + return output_depths, output_fovs From a8dd7049a5e2683a06f8d8df4cb7d22673d35b4b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 10:42:36 +0500 Subject: [PATCH 22/72] use torch interpolation for image processing --- .../depth_pro/image_processing_depth_pro.py | 112 +++++++++++------- 1 file changed, 66 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 883c50ebfe6fbd..d8b9ff493b1ab2 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,6 +14,7 @@ # limitations under the License. """Image processor class for DepthPro.""" +import functools from typing import Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union @@ -33,7 +34,7 @@ make_list_of_images, to_numpy_array, valid_images, - validate_preprocess_arguments, + pil_torch_interpolation_mapping, ) from ...utils import TensorType, filter_out_non_signature_kwargs, logging @@ -62,7 +63,6 @@ make_list_of_images, to_numpy_array, valid_images, - validate_preprocess_arguments, ) from ...utils import ( TensorType, @@ -99,6 +99,9 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. 
@@ -123,6 +126,7 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -138,15 +142,17 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample + self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD def resize( self, - image: np.ndarray, + images: List[np.ndarray], size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -155,12 +161,15 @@ def resize( Resize an image to `(size["height"], size["width"])`. Args: - image (`np.ndarray`): - Image to resize. + images (`List[np.ndarray]`): + Images to resize. size (`Dict[str, int]`): Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -175,41 +184,49 @@ def resize( - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. Returns: - `np.ndarray`: The resized image. + `np.ndarray`: The resized images. """ size = get_size_dict(size) if "height" not in size or "width" not in size: raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - # ic(image.dtype) - # ic(type(image)) - # ic(image.shape) - # ic(image.mean()) - # ic(image.std()) - # ic(image.min()) - # ic(image.max()) - # ic(output_size) - # ic(resample) - # ic(data_format) - # ic(input_data_format) - # # exit() - - # return torch.nn.functional.interpolate( - # input=torch.from_numpy(image), - # size=output_size, - # mode=resample, - # align_corners=True, - # ) - - return resize( - image, + images = np.stack(images) + images = torch.from_numpy(images) + + return torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=images, size=output_size, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) + # mode=pil_torch_interpolation_mapping[resample], + mode="bilinear", + antialias=antialias, + ).numpy() + + def _validate_input_arguments( + self, + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + data_format: Union[str, ChannelDimension], + ): + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") @filter_out_non_signature_kwargs() def preprocess( @@ -218,6 +235,7 @@ def preprocess( do_resize: Optional[bool] = None, size: Dict[str, int] = None, resample: PILImageResampling = None, + antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -242,6 +260,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -275,6 +296,7 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -289,15 +311,17 @@ def preprocess( "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "torch.Tensor, tf.Tensor or jax.ndarray." 
) - validate_preprocess_arguments( + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, - do_resize=do_resize, - size=size, - resample=resample, + data_format=data_format, ) # All transformations expect numpy arrays. @@ -313,15 +337,6 @@ def preprocess( # We assume that all images have the same channel dimension format. input_data_format = infer_channel_dimension_format(images[0]) - # TODO - # depth-pro image preprocessing scales the image before resizing it - - if do_resize: - images = [ - self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) - for image in images - ] - if do_rescale: images = [ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) @@ -338,6 +353,11 @@ def preprocess( to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] + # depth-pro scales the image before resizing it + # uses torch interpolation which requires ChannelDimension.FIRST + if do_resize: + images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) + data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) From 261bbafe4fb65d3bfe344045d92c7ca67f05283f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 12:12:39 +0500 Subject: [PATCH 23/72] complete post_process_depth_estimation --- .../depth_pro/image_processing_depth_pro.py | 71 +++++++++++-------- .../image_processing_depth_pro_fast.py | 70 ++++++++++-------- 2 files changed, 83 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index d8b9ff493b1ab2..0a7313e2d19a43 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,13 +14,13 @@ # limitations under the License. """Image processor class for DepthPro.""" -import functools from typing import Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union import numpy as np from icecream import ic + from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format from ...image_utils import ( @@ -186,6 +186,8 @@ def resize( Returns: `np.ndarray`: The resized images. """ + requires_backends(self, "torch") + size = get_size_dict(size) if "height" not in size or "width" not in size: raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") @@ -198,10 +200,9 @@ def resize( # input should be (B, C, H, W) input=images, size=output_size, - # mode=pil_torch_interpolation_mapping[resample], - mode="bilinear", + mode=pil_torch_interpolation_mapping[resample].value, antialias=antialias, - ).numpy() + ) def _validate_input_arguments( self, @@ -357,14 +358,16 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) + images = images.numpy() data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) def post_process_depth_estimation( self, - predicted_depth, - fov=None, + predicted_depths, + fovs=None, + target_sizes=None, ) -> List[Dict[str, TensorType]]: """ Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. @@ -383,35 +386,45 @@ def post_process_depth_estimation( """ requires_backends(self, "torch") - self.size = { - 'width': 3024, - 'height': 2268, - } - W = self.size['width'] - H = self.size['height'] - - if (fov is not None) and (len(predicted_depth) != len(fov)): + if (fovs is not None) and (len(predicted_depths) != len(fovs)): raise ValueError( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) + if (target_sizes is not None) and (len(predicted_depths) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + outputs = { + "predicted_depth": [], + "fov": [] if fovs is not None else None + } + + fovs = [None] * len(predicted_depths) if fovs is None else fovs + target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes + + for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - output_depths = [] - output_fovs = None if fov is None else [] - fov = [None] * len(predicted_depth) if fov is None else fov - for depth, fov_value in zip(predicted_depth, fov): + if target_size is not None: - if fov_value is not None: - fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * W / fov_value + # scale image w.r.t fov + if fov is not None: + width = target_size[1] + fov = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov)) + predicted_depth = predicted_depth * width / fov + outputs["fov"].append(fov) - depth = torch.nn.functional.interpolate( - depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False - ).squeeze() + # interpolate + predicted_depth = self.resize( + predicted_depth.unsqueeze(0).unsqueeze(1), + size=target_size, + resample=self.resample, + antialias=self.antialias + ).squeeze() - if fov_value is not None: - depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) - output_fovs.append(fov_value) + # inverse the depth + predicted_depth = 1.0 / torch.clamp(predicted_depth, min=1e-4, max=1e4) - output_depths.append(depth) + outputs["predicted_depth"].append(predicted_depth) - return output_depths, output_fovs + return outputs diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 8860f2e86830c0..38d699452e443a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -154,7 +154,7 @@ def _build_transforms( elif do_normalize: transforms.append(Normalize(image_mean, 
image_std)) - # depth-pro scales the image before resizing it + # depth-pro scales the image before resizing it if do_resize: transforms.append( Resize( @@ -229,9 +229,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -308,8 +308,9 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depth, - fov=None, + predicted_depths, + fovs=None, + target_sizes=None, ) -> List[Dict[str, TensorType]]: """ Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. @@ -328,35 +329,46 @@ def post_process_depth_estimation( """ requires_backends(self, "torch") - self.size = { - 'width': 3024, - 'height': 2268, - } - W = self.size['width'] - H = self.size['height'] - - if (fov is not None) and (len(predicted_depth) != len(fov)): + if (fovs is not None) and (len(predicted_depths) != len(fovs)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + if (target_sizes is not None) and (len(predicted_depths) != len(target_sizes)): raise ValueError( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - output_depths = [] - output_fovs = None if fov is None else [] - fov = [None] * len(predicted_depth) if fov is None else fov - for depth, fov_value in zip(predicted_depth, fov): + outputs = { + "predicted_depth": [], + "fov": [] if fovs is not None else None + } + + fovs = [None] * len(predicted_depths) if fovs is None else fovs + target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes + + for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): + + if target_size is not None: - if fov_value is not None: - fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * W / fov_value + # scale image w.r.t fov + if fov is not None: + width = target_size[1] + fov = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov)) + predicted_depth = predicted_depth * width / fov + outputs["fov"].append(fov) - depth = torch.nn.functional.interpolate( - depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False - ).squeeze() + # interpolate + predicted_depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=predicted_depth.unsqueeze(0).unsqueeze(1), + size=target_size, + mode=pil_torch_interpolation_mapping[self.resample].value, + antialias=self.antialias, + ).squeeze() - if fov_value is not None: - depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) - output_fovs.append(fov_value) + # inverse the depth + predicted_depth = 1.0 / torch.clamp(predicted_depth, min=1e-4, max=1e4) - output_depths.append(depth) + 
outputs["predicted_depth"].append(predicted_depth) - return output_depths, output_fovs + return outputs From a4b3556c5f7ef738048df1b7de22dfa45c822b43 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 16:36:19 +0500 Subject: [PATCH 24/72] config: fix imports and sort args --- .../depth_pro/configuration_depth_pro.py | 49 +++++++++---------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index f124d3e5b71ab7..fae3e84432be22 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -14,15 +14,8 @@ # limitations under the License. """DepthPro model configuration""" -from collections import OrderedDict -from typing import Mapping - -from packaging import version - -from transformers.configuration_utils import PretrainedConfig -from transformers.onnx import OnnxConfig -from transformers.utils import logging -from transformers.utils.backbone_utils import get_aligned_output_features_output_indices +from ...configuration_utils import PretrainedConfig +from ...utils import logging logger = logging.get_logger(__name__) @@ -41,6 +34,8 @@ class DepthProConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. + fusion_hidden_size + TODO num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): @@ -65,6 +60,8 @@ class DepthProConfig(PretrainedConfig): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. + patch_embeddings_size + TODO qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): @@ -73,22 +70,28 @@ class DepthProConfig(PretrainedConfig): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): Whether to use the SwiGLU feedforward neural network. - out_features (`List[str]`, *optional*): - If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. - (depending on how many stages the model has). If unset and `out_indices` is set, will default to the - corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. - out_indices (`List[int]`, *optional*): - If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how - many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. - If unset and `out_features` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. apply_layernorm (`bool`, *optional*, defaults to `True`): Whether to apply layer normalization to the feature maps in case the model is used as backbone. reshape_hidden_states (`bool`, *optional*, defaults to `True`): Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. 
+ intermediate_hook_ids + TODO + intermediate_feature_dims + TODO + scaled_images_ratios + TODO + scaled_images_overlap_ratios + TODO + scaled_images_feature_dims + TODO + use_batch_norm_in_fusion + TODO + use_fov_model + TODO + num_fov_head_layers + TODO Example: @@ -127,8 +130,6 @@ def __init__( layerscale_value=1.0, drop_path_rate=0.0, use_swiglu_ffn=False, - out_features=None, - out_indices=None, apply_layernorm=True, reshape_hidden_states=True, intermediate_hook_ids = [11, 5], @@ -137,7 +138,7 @@ def __init__( scaled_images_overlap_ratios = [0.0, 0.5, 0.25], scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_fusion=False, - use_fov_model=False, + use_fov_model=True, num_fov_head_layers=2, **kwargs, ): @@ -161,10 +162,6 @@ def __init__( self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate self.use_swiglu_ffn = use_swiglu_ffn - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=out_features, out_indices=out_indices, stage_names=self.stage_names - ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_fusion = use_batch_norm_in_fusion From f13c63208caec6b70a9d8660a42d92ec4c18af3a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 16:51:12 +0500 Subject: [PATCH 25/72] apply inference in weight conversion --- .../depth_pro/convert_depth_pro_to_hf.py | 63 ++++++++++++------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py index de7bf395a35552..7b4552c508fffe 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -18,24 +18,22 @@ """ import argparse -import json from pathlib import Path import re import requests import torch -import torch.nn as nn from huggingface_hub import hf_hub_download from PIL import Image -from torchvision import transforms -from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.image_utils import PILImageResampling from transformers.utils import logging +# from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation # TODO: import directly from transformers from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation +from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast logging.set_verbosity_info() @@ -147,13 +145,21 @@ def inference_test(processor, model): predicted_depth = outputs.predicted_depth fov = outputs.fov + target_sizes = [[image.height, image.width]] * len(predicted_depth) - predicted_depth, fov = processor.post_process_depth_estimation(predicted_depth, fov) + outputs = processor.post_process_depth_estimation( + predicted_depths=predicted_depth, + fovs=fov, + target_sizes=target_sizes, + ) + predicted_depth = outputs['predicted_depth'] + fov = outputs['fov'] - print("predicted_depth.shape:", predicted_depth.shape) - print("fov.shape:", fov.shape) + print("\nInference ...") + print("predicted_depth:", predicted_depth) + 
print("predicted_depth[0].shape:", predicted_depth[0].shape) print("fov:", fov) - print("Inference was Successfull!") + print("Inference was Successfull!\n") @torch.no_grad() @@ -167,6 +173,7 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu # load original weights from huggingface hub file_path = hf_hub_download(repo_id, filename) + # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" state_dict = torch.load(file_path, weights_only=True) # enumerate fusion layers @@ -235,23 +242,31 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu model = DepthProForDepthEstimation(config, use_fov_model=True).eval() model.load_state_dict(state_dict) - # TODO - processor = ... - # inference_test(processor, model) + processor = DepthProImageProcessorFast( + do_resize = True, + size = {"height": 1536, "width": 1536}, + resample = PILImageResampling.BILINEAR, + antialias = False, + do_rescale = True, + rescale_factor = 1 / 255, + do_normalize = True, + image_mean = 0.5, + image_std = 0.5, + return_tensors = "pt", + ) + inference_test(processor, model) if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - # TODO - # print(f"Saving image processor to {pytorch_dump_folder_path}") - # processor.save_pretrained(pytorch_dump_folder_path) - + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) - # TODO - # if push_to_hub: - # model.push_to_hub("...") - # processor.push_to_hub("...") + if push_to_hub: + hub_path = "geetu040/DepthPro" + model.push_to_hub(hub_path) + processor.push_to_hub(hub_path) """ @@ -260,8 +275,8 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu convert_depth_pro_checkpoint( "apple/DepthPro", "depth_pro.pt", - "my_local_dump", - False, + "my_local_depth_pro_dump", + True, ) ``` @@ -270,8 +285,8 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ --repo_id "apple/DepthPro" \ --filename "depth_pro.pt" \ - --pytorch_dump_folder_path "my_local_dump" \ - --push_to_hub 0 + --pytorch_dump_folder_path "my_local_depth_pro_dump" \ + --push_to_hub ``` """ From 387ddd8c7e50f419d1abcd5a61cd48ea23e0d626 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 10:55:18 +0500 Subject: [PATCH 26/72] use mllama script instead for weight conversion --- .../depth_pro/convert_depth_pro_to_hf.py | 317 ------------------ .../convert_depth_pro_weights_to_hf.py | 255 ++++++++++++++ 2 files changed, 255 insertions(+), 317 deletions(-) delete mode 100644 src/transformers/models/depth_pro/convert_depth_pro_to_hf.py create mode 100644 src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py deleted file mode 100644 index 7b4552c508fffe..00000000000000 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ /dev/null @@ -1,317 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DepthPro checkpoints from the original repository. - -URL: https://huggingface.co/apple/DepthPro/tree/main -""" - -import argparse -from pathlib import Path -import re - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - -# from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation -# TODO: import directly from transformers -from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig -from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation -from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def create_vit_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def 
read_in_q_k_v(state_dict, config): - state_dict_keys = state_dict.keys() - for key in list(state_dict_keys): - if "qkv" in key: - in_proj = state_dict.pop(key) - q, k, v = torch.split(in_proj, config.hidden_size, dim=0) - - if "fov" in key: - key = key.replace('fov.encoder.0', 'fov_model.encoder') - else: - key = "depth_pro." + key - - key = key.replace("blocks", "encoder.layer") - state_dict[key.replace("attn.qkv", "attention.attention.query")] = q - state_dict[key.replace("attn.qkv", "attention.attention.key")] = k - state_dict[key.replace("attn.qkv", "attention.attention.value")] = v - return state_dict - - -# hard coded upsample keys -def update_hard_coded_keys(state_dict): - mapping = [ - # upsamples - ('encoder.upsample_latent0.0.weight', 'depth_pro.encoder.upsample_intermediate.1.proj.weight'), - ('encoder.upsample_latent0.1.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight'), - ('encoder.upsample_latent0.2.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight'), - ('encoder.upsample_latent0.3.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight'), - ('encoder.upsample_latent1.0.weight', 'depth_pro.encoder.upsample_intermediate.0.proj.weight'), - ('encoder.upsample_latent1.1.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight'), - ('encoder.upsample_latent1.2.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight'), - ('encoder.upsample0.0.weight', 'depth_pro.encoder.upsample_scaled_images.2.proj.weight'), - ('encoder.upsample0.1.weight', 'depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight'), - ('encoder.upsample1.0.weight', 'depth_pro.encoder.upsample_scaled_images.1.proj.weight'), - ('encoder.upsample1.1.weight', 'depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight'), - ('encoder.upsample2.0.weight', 'depth_pro.encoder.upsample_scaled_images.0.proj.weight'), - ('encoder.upsample2.1.weight', 'depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight'), - ('encoder.upsample_lowres.weight', 'depth_pro.encoder.upsample_image.upsample_blocks.0.weight'), - ('encoder.upsample_lowres.bias', 'depth_pro.encoder.upsample_image.upsample_blocks.0.bias'), - - # neck - ("fov.downsample.0.weight", "fov_model.global_neck.0.weight"), - ("fov.downsample.0.bias", "fov_model.global_neck.0.bias"), - ("fov.encoder.1.weight", "fov_model.encoder_neck.weight"), - ("fov.encoder.1.bias", "fov_model.encoder_neck.bias"), - ] - for src, dest in mapping: - state_dict[dest] = state_dict.pop(src) - - return state_dict - - -# We will verify our results on an image of cute cats -def inference_test(processor, model): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - inputs = processor(image) - with torch.no_grad(): - outputs = model(**inputs) - - predicted_depth = outputs.predicted_depth - fov = outputs.fov - target_sizes = [[image.height, image.width]] * len(predicted_depth) - - outputs = processor.post_process_depth_estimation( - predicted_depths=predicted_depth, - fovs=fov, - target_sizes=target_sizes, - ) - predicted_depth = outputs['predicted_depth'] - fov = outputs['fov'] - - print("\nInference ...") - print("predicted_depth:", predicted_depth) - print("predicted_depth[0].shape:", predicted_depth[0].shape) - print("fov:", fov) - print("Inference was Successfull!\n") - - -@torch.no_grad() -def convert_depth_pro_checkpoint(repo_id, filename, 
pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DepthPro structure. - """ - - # define default DepthPro configuration - config = DepthProConfig(use_fov_model=True) - - # load original weights from huggingface hub - file_path = hf_hub_download(repo_id, filename) - # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" - state_dict = torch.load(file_path, weights_only=True) - - # enumerate fusion layers - n_scaled_images = len(config.scaled_images_ratios) # 3 - n_intermediate_hooks = len(config.intermediate_hook_ids) # 2 - n_fusion_layers = n_scaled_images + n_intermediate_hooks # 5 - - # 1. keys for vit encoders - vit_rename_keys = create_vit_rename_keys(config) - for src_prefix, dest_prefix in [ - ("encoder.patch_encoder", "depth_pro.encoder.patch_encoder"), - ("encoder.image_encoder", "depth_pro.encoder.image_encoder"), - ("fov.encoder.0", "fov_model.encoder"), - ]: - for src, dest in vit_rename_keys: - src = src_prefix + "." + src - dest = dest_prefix + "." + dest - state_dict[dest] = state_dict.pop(src) - - # 2. qkv keys for vit encoders - state_dict = read_in_q_k_v(state_dict, config) - - # 3. hard coded mapping - state_dict = update_hard_coded_keys(state_dict) - - - for key in list(state_dict.keys()): - - # 4. final depth estimation head - if key.startswith("head."): - new_key = "head." + key - - # 5. fov model head - elif key.startswith("fov.head."): - new_key = key.replace("fov", 'fov_model') - - # 6. projections between encoder and fusion - elif "decoder.convs." in key: - n = re.findall(r'\d+', key)[0] # find digit inside string - n = n_fusion_layers - int(n) - 1 - new_key = f"projections.{n}.weight" - - # 7. fuse low res with image features - elif "encoder.fuse_lowres." in key: - new_key = key.replace("encoder.fuse_lowres", "depth_pro.encoder.fuse_image_with_low_res") - - # 8. fusion stage (decoder) - elif key.startswith("decoder.fusions."): - new_key = key.replace("decoder.fusions.", "fusion_stage.layers.") - new_key = new_key.replace("resnet1", "residual_layer1") - new_key = new_key.replace("resnet2", "residual_layer2") - new_key = new_key.replace("residual.1", "convolution1") - new_key = new_key.replace("residual.3", "convolution2") - new_key = new_key.replace("out_conv", "projection") - - n_with_dots = re.findall(r'.\d+.', new_key)[0] # find digit inside string followed by . 
- n = n_with_dots[1:-1] - n = n_fusion_layers - int(n) - 1 - new_key = new_key.replace(n_with_dots, f".{n}.") - - else: - continue - - state_dict[new_key] = state_dict.pop(key) - - model = DepthProForDepthEstimation(config, use_fov_model=True).eval() - model.load_state_dict(state_dict) - - processor = DepthProImageProcessorFast( - do_resize = True, - size = {"height": 1536, "width": 1536}, - resample = PILImageResampling.BILINEAR, - antialias = False, - do_rescale = True, - rescale_factor = 1 / 255, - do_normalize = True, - image_mean = 0.5, - image_std = 0.5, - return_tensors = "pt", - ) - inference_test(processor, model) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - hub_path = "geetu040/DepthPro" - model.push_to_hub(hub_path) - processor.push_to_hub(hub_path) - - -""" -- create files locally using function -```py -convert_depth_pro_checkpoint( - "apple/DepthPro", - "depth_pro.pt", - "my_local_depth_pro_dump", - True, -) -``` - -- create files locally using command line args -```cmd -python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ - --repo_id "apple/DepthPro" \ - --filename "depth_pro.pt" \ - --pytorch_dump_folder_path "my_local_depth_pro_dump" \ - --push_to_hub -``` -""" - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required parameters - parser.add_argument( - "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." - ) - parser.add_argument( - "--filename", default="depth_pro.pt", type=str, help="Name of the file from repo you'd like to convert." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - - args = parser.parse_args() - convert_depth_pro_checkpoint( - args.repo_id, - args.filename, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py new file mode 100644 index 00000000000000..fe862d7469a1d3 --- /dev/null +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -0,0 +1,255 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
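The rewritten script below follows a regex-driven key-remapping pattern: original checkpoint keys are translated into Transformers-style keys via `re.sub` (some mapping entries use callables for index arithmetic), and fused qkv projections are split into separate query, key and value tensors. A simplified, self-contained illustration of both steps; the toy key and tensor shapes are invented for the example:

```py
import re

import torch

# One entry in the style of ORIGINAL_TO_CONVERTED_KEY_MAPPING, specialized to the patch encoder.
toy_mapping = {
    r"encoder.patch_encoder.blocks.(\d+).norm(\d+).(weight|bias)":
        r"depth_pro.encoder.patch_encoder.encoder.layer.\1.norm\2.\3",
}

old_key = "encoder.patch_encoder.blocks.3.norm1.weight"
new_key = old_key
for pattern, replacement in toy_mapping.items():
    new_key = re.sub(pattern, replacement, new_key)
print(new_key)  # depth_pro.encoder.patch_encoder.encoder.layer.3.norm1.weight

# Splitting a fused qkv projection into q, k, v (hidden_size assumed to be 8 here).
fused = torch.randn(3 * 8, 8)
q, k, v = torch.split(fused, split_size_or_sections=8, dim=0)
print(q.shape, k.shape, v.shape)  # each torch.Size([8, 8])
```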
+ +import argparse +import gc +import os + +import regex as re +import torch +from huggingface_hub import hf_hub_download +from transformers.image_utils import PILImageResampling + +from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig +from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast +from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation + + +# fmt: off +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + + # patch_encoder/image_encoder (ViT based) + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", + + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + r"decoder.convs.(\d+).weight": lambda match: ( + f"projections.{4-int(match.group(1))}.weight" + ), + + # fov_model.encoder (ViT based) + r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + + # fov head + r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + + # fusion stage + r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" + ), + r"decoder.fusions.(\d+).out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.projection.{match.group(2)}" + ), + 
r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" + ), + + # qkv attentions blocks + + # upsamples (hard coded; regex is not very feasible here) + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", + "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", +} +# fmt: on + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line + continue + new_text = re.sub(pattern, replacement, new_text) + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + return output_dict + +def get_qkv_state_dict(key, parameter): + qkv_state_dict = {} + placeholder = re.search(r'(\(.*?\))', key).group(1) + replacements_keys = placeholder[1:-1].split("|") + replacements_vals = torch.split( + parameter, + split_size_or_sections=parameter.size(0)//len(replacements_keys), + dim=0 + ) + for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): + qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val + return qkv_state_dict + +def write_model( + hf_repo_id: str, + output_dir: str, + safe_serialization: bool=True, +): + os.makedirs(output_dir, exist_ok=True) + + # ------------------------------------------------------------ + # Create and save config + # ------------------------------------------------------------ + + # create config + config = DepthProConfig( + # this config is same as the default config and used for pre-trained weights + hidden_size=1024, + fusion_hidden_size=256, + num_hidden_layers=24, + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=1536, + patch_size=384, + num_channels=3, + patch_embeddings_size=16, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + 
use_swiglu_ffn=False,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        intermediate_hook_ids = [11, 5],
+        intermediate_feature_dims = [256, 256],
+        scaled_images_ratios = [0.25, 0.5, 1],
+        scaled_images_overlap_ratios = [0.0, 0.5, 0.25],
+        scaled_images_feature_dims = [1024, 1024, 512],
+        use_batch_norm_in_fusion=False,
+        use_fov_model=True,
+        num_fov_head_layers=2,
+    )
+
+    # save config
+    config.save_pretrained(output_dir)
+    print("Model config saved successfully...")
+
+    # ------------------------------------------------------------
+    # Convert weights
+    # ------------------------------------------------------------
+
+    # download and load the original state_dict from the hf repo
+    file_path = hf_hub_download(hf_repo_id, "depth_pro.pt")
+    # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" # when you already have the files locally
+    loaded = torch.load(file_path, weights_only=True)
+
+    print("Converting model...")
+    all_keys = list(loaded.keys())
+    new_keys = convert_old_keys_to_new_keys(all_keys)
+
+    state_dict = {}
+    for key in all_keys:
+        new_key = new_keys[key]
+        current_parameter = loaded.pop(key)
+
+        if "qkv" in key:
+            qkv_state_dict = get_qkv_state_dict(new_key, current_parameter)
+            state_dict.update(qkv_state_dict)
+        else:
+            state_dict[new_key] = current_parameter
+
+    print("Loading the checkpoint in a DepthPro model.")
+    model = DepthProForDepthEstimation(config)
+    model.load_state_dict(state_dict, strict=True, assign=True)
+    print("Checkpoint loaded successfully.")
+
+    print("Saving the model.")
+    model.save_pretrained(output_dir, safe_serialization=safe_serialization)
+    del state_dict, model
+
+    # Safety check: reload the converted model
+    gc.collect()
+    print("Reloading the model to check if it's saved correctly.")
+    DepthProForDepthEstimation.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto")
+    print("Model reloaded successfully.")
+
+def write_image_processor(output_dir: str):
+    image_processor = DepthProImageProcessorFast(
+        do_resize = True,
+        size = {"height": 1536, "width": 1536},
+        resample = PILImageResampling.BILINEAR,
+        antialias = False,
+        do_rescale = True,
+        rescale_factor = 1 / 255,
+        do_normalize = True,
+        image_mean = 0.5,
+        image_std = 0.5,
+        return_tensors = "pt",
+    )
+    image_processor.save_pretrained(output_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--hf_repo_id",
+        default="apple/DepthPro",
+        help="Location of the official weights from Apple on the Hugging Face Hub",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default="apple_DepthPro",
+        help="Location to write the converted HF model and processor",
+    )
+    parser.add_argument(
+        "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`."
+ ) + args = parser.parse_args() + + write_model( + hf_repo_id=args.hf_repo_id, + output_dir=args.output_dir, + safe_serialization=args.safe_serialization, + ) + + write_image_processor( + output_dir=args.output_dir, + ) + + +if __name__ == "__main__": + main() From 9b67f9d2afc1b081a4990149eb16ea906ce09295 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 11:09:50 +0500 Subject: [PATCH 27/72] clean weight conversion script --- .../convert_depth_pro_weights_to_hf.py | 106 +++++++++--------- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index fe862d7469a1d3..0b81e8907e299e 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -29,39 +29,55 @@ # fmt: off ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # patch_encoder/image_encoder (ViT based) - r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", - r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", - r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", - r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", - r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", - r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", - r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + # encoder and head + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", + 
r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + + # fov + r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + + # upsamples (hard coded; regex is not very feasible here) + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", + "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", + + # projections between encoder and fusion r"decoder.convs.(\d+).weight": lambda match: ( f"projections.{4-int(match.group(1))}.weight" ), - # fov_model.encoder (ViT based) - r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", - r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", - r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", - r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", - r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", - 
r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", - r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", - r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", - r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", - - # fov head - r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", - r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", - r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", - # fusion stage r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" @@ -72,25 +88,6 @@ r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" ), - - # qkv attentions blocks - - # upsamples (hard coded; regex is not very feasible here) - "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", - "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", - "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", - "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", - "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", - "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", - "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", - "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", - "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", - "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", - "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", - "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", - "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", - "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", - "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", } # fmt: on @@ -108,9 +105,18 @@ def convert_old_keys_to_new_keys(state_dict_keys: dict = None): return output_dict def get_qkv_state_dict(key, parameter): + """ + new key which looks like this + xxxx.(q|k|v).xxx (m, n) + + is converted to + xxxx.q.xxxx (m//3, n) + xxxx.k.xxxx (m//3, n) + xxxx.v.xxxx (m//3, n) + """ qkv_state_dict = {} - placeholder = re.search(r'(\(.*?\))', key).group(1) - replacements_keys = placeholder[1:-1].split("|") + placeholder = re.search(r'(\(.*?\))', key).group(1) # finds "(query|key|value)" + replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] replacements_vals = torch.split( parameter, split_size_or_sections=parameter.size(0)//len(replacements_keys), From 617c872fb90d313f03fc55962088127e659241c7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 12:57:50 +0500 Subject: [PATCH 28/72] add depth-pro status in other files --- src/transformers/__init__.py 
| 16 +++++ .../models/auto/configuration_auto.py | 2 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/depth_pro/__init__.py | 72 +++++++++++++++++++ .../convert_depth_pro_weights_to_hf.py | 8 ++- .../depth_pro/image_processing_depth_pro.py | 2 - utils/check_docstrings.py | 1 + utils/check_repo.py | 1 + 9 files changed, 101 insertions(+), 5 deletions(-) create mode 100644 src/transformers/models/depth_pro/__init__.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 47b43e0b90896f..3d0b85e3a1b424 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -408,6 +408,7 @@ "DPRReaderTokenizer", ], "models.dpt": ["DPTConfig"], + "models.depth_pro": ["DepthProConfig"], "models.efficientnet": ["EfficientNetConfig"], "models.electra": [ "ElectraConfig", @@ -1195,6 +1196,7 @@ _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) + _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) @@ -2136,6 +2138,13 @@ "DPTPreTrainedModel", ] ) + _import_structure["models.depth_pro"].extend( + [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + ) _import_structure["models.efficientnet"].extend( [ "EfficientNetForImageClassification", @@ -5272,6 +5281,7 @@ DPRReaderTokenizer, ) from .models.dpt import DPTConfig + from .models.depth_pro import DepthProConfig from .models.efficientnet import ( EfficientNetConfig, ) @@ -6100,6 +6110,7 @@ from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor + from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.efficientnet import EfficientNetImageProcessor from .models.flava import ( FlavaFeatureExtractor, @@ -6907,6 +6918,11 @@ DPTModel, DPTPreTrainedModel, ) + from .models.depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) from .models.efficientnet import ( EfficientNetForImageClassification, EfficientNetModel, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 48625ea3f346cd..d8860d38f85046 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -93,6 +93,7 @@ ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), + ("depth_pro", "DepthProConfig"), ("efficientformer", "EfficientFormerConfig"), ("efficientnet", "EfficientNetConfig"), ("electra", "ElectraConfig"), @@ -394,6 +395,7 @@ ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), + ("depth_pro", "DepthPro"), ("efficientformer", "EfficientFormer"), ("efficientnet", "EfficientNet"), ("electra", "ELECTRA"), diff --git a/src/transformers/models/auto/image_processing_auto.py 
b/src/transformers/models/auto/image_processing_auto.py index a8960d80acc838..e7b53f30a7a064 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -77,6 +77,7 @@ ("dinov2", ("BitImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)), ("dpt", ("DPTImageProcessor",)), + ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor",)), ("efficientnet", ("EfficientNetImageProcessor",)), ("flava", ("FlavaImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 67c539fca66496..4cc15ca4ca51c2 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -92,6 +92,7 @@ ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), + ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("electra", "ElectraModel"), @@ -571,6 +572,7 @@ ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dpt", "DPTModel"), + ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), @@ -866,6 +868,7 @@ # Model for depth estimation mapping ("depth_anything", "DepthAnythingForDepthEstimation"), ("dpt", "DPTForDepthEstimation"), + ("depth_pro", "DepthProForDepthEstimation"), ("glpn", "GLPNForDepthEstimation"), ("zoedepth", "ZoeDepthForDepthEstimation"), ] diff --git a/src/transformers/models/depth_pro/__init__.py b/src/transformers/models/depth_pro/__init__.py new file mode 100644 index 00000000000000..1f2a6646c5c07f --- /dev/null +++ b/src/transformers/models/depth_pro/__init__.py @@ -0,0 +1,72 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
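The auto-class entries added above in `configuration_auto.py`, `image_processing_auto.py`, and `modeling_auto.py` are what expose DepthPro through the Auto API. A rough usage sketch, assuming a converted checkpoint is published under `geetu040/DepthPro` (the repo id used in the docstrings of this series) and that the output carries `predicted_depth` and `fov` fields; exact names may differ in the final release:

```python
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, AutoModelForDepthEstimation

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Assumed checkpoint id, taken from the docstrings in this series; not guaranteed to exist yet.
processor = AutoImageProcessor.from_pretrained("geetu040/DepthPro")
model = AutoModelForDepthEstimation.from_pretrained("geetu040/DepthPro")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post_process_depth_estimation resizes the prediction back to the source image size
# and rescales depth using the predicted field of view.
post = processor.post_process_depth_estimation(
    outputs.predicted_depth,
    fovs=outputs.fov,  # assumed attribute name on the depth estimator output
    target_sizes=[image.size[::-1]],  # PIL size is (width, height); targets are (height, width)
)
depth = post["predicted_depth"][0]
```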
+from typing import TYPE_CHECKING + +from ...file_utils import _LazyModule, is_torch_available, is_vision_available +from ...utils import OptionalDependencyNotAvailable + + +_import_structure = {"configuration_depth_pro": ["DepthProConfig"]} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_depth_pro"] = ["DepthProImageProcessor"] + _import_structure["image_processing_depth_pro_fast"] = ["DepthProImageProcessorFast"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_depth_pro"] = [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_depth_pro import DepthProConfig + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_depth_pro import DepthProImageProcessor + from .image_processing_depth_pro_fast import DepthProImageProcessorFast + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 0b81e8907e299e..741016e88a3d62 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -21,9 +21,11 @@ from huggingface_hub import hf_hub_download from transformers.image_utils import PILImageResampling -from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig -from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast -from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation +from transformers import ( + DepthProConfig, + DepthProImageProcessorFast, + DepthProForDepthEstimation, +) # fmt: off diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 0a7313e2d19a43..99a7c26c98269a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -72,8 +72,6 @@ requires_backends, ) -from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimatorOutput - if is_torch_available(): import torch diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 0be960f4a33e6d..34deed0df47e01 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -140,6 +140,7 @@ "DPRReaderTokenizer", "DPRReaderTokenizerFast", "DPTModel", + "DepthProModel", "Data2VecAudioConfig", "Data2VecTextConfig", "Data2VecTextModel", diff --git a/utils/check_repo.py b/utils/check_repo.py index 10be5cdcd26230..2e131e8791530e 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -213,6 +213,7 @@ "JukeboxPrior", "SamModel", "DPTForDepthEstimation", + "DepthProForDepthEstimation", "DecisionTransformerGPT2Model", 
"GLPNForDepthEstimation", "ViltForImagesAndTextClassification", From 6e1c512b15474979ea3176e85214ccc70fcc6cd7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 13:33:25 +0500 Subject: [PATCH 29/72] fill docstring in config --- .../depth_pro/configuration_depth_pro.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index fae3e84432be22..9b53288c41ed08 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -34,8 +34,8 @@ class DepthProConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. - fusion_hidden_size - TODO + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): @@ -53,15 +53,17 @@ class DepthProConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - TODO: image_size / 2**n_fusion_blocks = patch_size / patch_embeddings_size - The size (resolution) of each image. + image_size (`int`, *optional*, defaults to 1536): + The size (resolution) of each image, + To generate depth of same size as image, + image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size + where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. - patch_embeddings_size - TODO + patch_embeddings_size (`int`, *optional*, defaults to 16): + kernel_size and stride for convolution in PatchEmbeddings. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): @@ -77,21 +79,21 @@ class DepthProConfig(PretrainedConfig): case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. intermediate_hook_ids - TODO + Indices of the intermediate hidden states from patch_encoder to use for fusion. intermediate_feature_dims - TODO + Hidden state during upsampling for each intermediate hidden states in intermediate_hook_ids. scaled_images_ratios - TODO + Use images of these ratios for patch_encoder. scaled_images_overlap_ratios - TODO + Overlap ratio between patches for each scaled image in scaled_image_ratios. scaled_images_feature_dims - TODO + Hidden state during upsampling for each scaled image in scaled_images_ratios. use_batch_norm_in_fusion - TODO + Whether to use batch normalization in the residual units of the fusion blocks. use_fov_model - TODO + Whether to use `DepthProFOVModel` to generate Field of View. num_fov_head_layers - TODO + No of convolution layers in head of `DepthProFOVModel`. 
Example: From 12ee607e5d319a488d7e807a75927cb86f463cec Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 18:47:53 +0500 Subject: [PATCH 30/72] formatting --- .../depth_pro/configuration_depth_pro.py | 2 +- .../convert_depth_pro_weights_to_hf.py | 28 ++++----- .../depth_pro/image_processing_depth_pro.py | 48 +++++++++------ .../image_processing_depth_pro_fast.py | 40 ++++++++----- .../models/depth_pro/modeling_depth_pro.py | 58 ++++++------------- 5 files changed, 88 insertions(+), 88 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 9b53288c41ed08..8bab8227be7ec7 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -67,7 +67,7 @@ class DepthProConfig(PretrainedConfig): qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): - Initial value to use for layer scale. + Initial value to use for layer scale. drop_path_rate (`float`, *optional*, defaults to 0.0): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 741016e88a3d62..c3b77f17f04c69 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -41,7 +41,7 @@ r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", r"head.(\d+).(weight|bias)": r"head.head.\1.\2", # fov @@ -59,19 +59,19 @@ r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", # upsamples (hard coded; regex is not very feasible here) - "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", - "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", - "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", - "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", - "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", - "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", - "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", - "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", - "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", - "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", - "encoder.upsample1.1.weight": 
"depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", - "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", - "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 99a7c26c98269a..0e3c7d6455b07f 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -166,8 +166,8 @@ def resize( resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -260,8 +260,8 @@ def preprocess( `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -352,7 +352,7 @@ def preprocess( to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] - # depth-pro scales the image before resizing it + # depth-pro scales the image before resizing it # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) @@ -363,24 +363,36 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depths, - fovs=None, - target_sizes=None, - ) -> List[Dict[str, TensorType]]: + predicted_depths: Union[TensorType, List[TensorType]], + fovs: Optional[Union[TensorType, List[TensorType], None]] = None, + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: """ - Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. - Only supports PyTorch. + Post-processes the raw depth predictions from the model to generate final depth predictions and optionally + resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) + and adjusts depth values accordingly. Args: - outputs ([`DepthEstimatorOutput`]): - Raw outputs of the model. - target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. + predicted_depths (`Union[TensorType, List[TensorType]]`): + Raw depth predictions output by the model. Can be a single tensor or a list of tensors, each + corresponding to an image in the batch. + fovs (`Optional[Union[TensorType, List[TensorType], None]]`, *optional*, defaults to `None`): + Field of view (FoV) values corresponding to each depth prediction. Should have the same length + as `predicted_depths` if provided. If `None`, FoV scaling is skipped. + target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. Returns: - `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + `Dict[str, List[TensorType]]`: + A dictionary containing: + - `"predicted_depth"`: A list of processed depth tensors. + - `"fov"`: A list of processed FoV values if provided, otherwise `None`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. 
""" requires_backends(self, "torch") diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 38d699452e443a..3af05df3ccb886 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -15,7 +15,7 @@ """Fast Image processor class for DepthPro.""" import functools -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Tuple from ...image_processing_base import BatchFeature from ...image_processing_utils import get_size_dict @@ -308,24 +308,36 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depths, - fovs=None, - target_sizes=None, - ) -> List[Dict[str, TensorType]]: + predicted_depths: Union[TensorType, List[TensorType]], + fovs: Optional[Union[TensorType, List[TensorType], None]] = None, + target_sizes: Optional[Union[TensorType, List[tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: """ - Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. - Only supports PyTorch. + Post-processes the raw depth predictions from the model to generate final depth predictions and optionally + resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) + and adjusts depth values accordingly. Args: - outputs ([`DepthEstimatorOutput`]): - Raw outputs of the model. - target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. + predicted_depths (`Union[TensorType, List[TensorType]]`): + Raw depth predictions output by the model. Can be a single tensor or a list of tensors, each + corresponding to an image in the batch. + fovs (`Optional[Union[TensorType, List[TensorType], None]]`, *optional*, defaults to `None`): + Field of view (FoV) values corresponding to each depth prediction. Should have the same length + as `predicted_depths` if provided. If `None`, FoV scaling is skipped. + target_sizes (`Optional[Union[TensorType, List[tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. Returns: - `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + `Dict[str, List[TensorType]]`: + A dictionary containing: + - `"predicted_depth"`: A list of processed depth tensors. + - `"fov"`: A list of processed FoV values if provided, otherwise `None`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. 
""" requires_backends(self, "torch") diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index eb8bf02f83d160..b184b5985ba18c 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -44,6 +44,13 @@ logger = logging.get_logger(__name__) +# General docstring +_CONFIG_FOR_DOC = "DepthProConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "geetu040/DepthPro" +_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024] + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT class DepthProViTPatchEmbeddings(nn.Module): @@ -942,7 +949,7 @@ def forward( # STEP 8: return these features in order of increasing size as what fusion expects last_hidden_state = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) - *scaled_images_features, + *scaled_images_features, # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) *intermediate_features, ] @@ -1049,14 +1056,7 @@ class PreTrainedModel self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # TODO - # @add_code_sample_docstrings( - # checkpoint=_CHECKPOINT_FOR_DOC, - # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, - # config_class=_CONFIG_FOR_DOC, - # modality="vision", - # expected_output=_EXPECTED_OUTPUT_SHAPE, - # ) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -1065,6 +1065,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: + r""" + Returns: + + Examples: + TODO + ```python + ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1399,7 +1406,7 @@ def __init__(self, config, use_fov_model=None): @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -1418,37 +1425,6 @@ def forward( Examples: TODO ```python - >>> from transformers import AutoImageProcessor, DPTForDepthEstimation - >>> import torch - >>> import numpy as np - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large") - >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") - - >>> # prepare image for the model - >>> inputs = image_processor(images=image, return_tensors="pt") - - >>> with torch.no_grad(): - ... outputs = model(**inputs) - ... predicted_depth = outputs.predicted_depth - - >>> # interpolate to original size - >>> prediction = torch.nn.functional.interpolate( - ... predicted_depth.unsqueeze(1), - ... size=image.size[::-1], - ... mode="bicubic", - ... align_corners=False, - ... 
) - - >>> # visualize the prediction - >>> output = prediction.squeeze().cpu().numpy() - >>> formatted = (output * 255 / np.max(output)).astype("uint8") - >>> depth = Image.fromarray(formatted) ```""" loss = None if labels is not None: From d0a8733f275941adb827a4f7e3850c2a28d66006 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:25:43 +0500 Subject: [PATCH 31/72] more formatting --- .../models/depth_pro/image_processing_depth_pro.py | 7 +++---- src/transformers/models/depth_pro/modeling_depth_pro.py | 7 +------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 0e3c7d6455b07f..21810bfab64573 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -15,14 +15,13 @@ """Image processor class for DepthPro.""" from typing import Dict, List, Optional, Union -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import numpy as np -from icecream import ic from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import resize, to_channel_dimension_format +from ...image_transforms import to_channel_dimension_format from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, @@ -39,7 +38,7 @@ from ...utils import TensorType, filter_out_non_signature_kwargs, logging import math -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index b184b5985ba18c..3812f678b43fb9 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -14,23 +14,18 @@ # limitations under the License. 
"""PyTorch DepthPro model.""" -from icecream import ic - -import collections.abc import math -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple, Union import torch from torch import nn from dataclasses import dataclass -from ...utils import ModelOutput from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, DepthEstimatorOutput ) from ...utils import ( - add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging, From e6b385a9edf92a5c7f342935d75ae3e017fe122c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:45:20 +0500 Subject: [PATCH 32/72] formatting with ruff --- .../convert_depth_pro_weights_to_hf.py | 6 +-- .../depth_pro/image_processing_depth_pro.py | 39 ++----------------- .../image_processing_depth_pro_fast.py | 5 ++- .../models/depth_pro/modeling_depth_pro.py | 10 ++--- 4 files changed, 13 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index c3b77f17f04c69..66dfff12065a70 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -19,13 +19,13 @@ import regex as re import torch from huggingface_hub import hf_hub_download -from transformers.image_utils import PILImageResampling from transformers import ( DepthProConfig, - DepthProImageProcessorFast, DepthProForDepthEstimation, + DepthProImageProcessorFast, ) +from transformers.image_utils import PILImageResampling # fmt: off @@ -126,7 +126,7 @@ def get_qkv_state_dict(key, parameter): ) for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val - return qkv_state_dict + return qkv_state_dict def write_model( hf_repo_id: str, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 21810bfab64573..6c9c7f94e2265c 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,12 +14,10 @@ # limitations under the License. 
"""Image processor class for DepthPro.""" -from typing import Dict, List, Optional, Union -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np - from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import to_channel_dimension_format from ...image_utils import ( @@ -30,43 +28,15 @@ PILImageResampling, infer_channel_dimension_format, is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, - pil_torch_interpolation_mapping, -) -from ...utils import TensorType, filter_out_non_signature_kwargs, logging - -import math -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union - - -if TYPE_CHECKING: - from ...modeling_outputs import DepthEstimatorOutput - -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import pad, resize, to_channel_dimension_format -from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - is_scaled_image, is_torch_available, - is_torch_tensor, make_list_of_images, + pil_torch_interpolation_mapping, to_numpy_array, valid_images, ) from ...utils import ( TensorType, filter_out_non_signature_kwargs, - is_vision_available, logging, requires_backends, ) @@ -75,9 +45,6 @@ if is_torch_available(): import torch -if is_vision_available(): - import PIL - logger = logging.get_logger(__name__) @@ -379,7 +346,7 @@ def post_process_depth_estimation( Field of view (FoV) values corresponding to each depth prediction. Should have the same length as `predicted_depths` if provided. If `None`, FoV scaling is skipped. target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): - Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing is performed. diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 3af05df3ccb886..46b502d7d26f2c 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -15,7 +15,7 @@ """Fast Image processor class for DepthPro.""" import functools -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict, List, Optional, Union from ...image_processing_base import BatchFeature from ...image_processing_utils import get_size_dict @@ -35,6 +35,7 @@ from ...utils import TensorType, logging, requires_backends from ...utils.import_utils import is_torch_available, is_torchvision_available + logger = logging.get_logger(__name__) @@ -325,7 +326,7 @@ def post_process_depth_estimation( Field of view (FoV) values corresponding to each depth prediction. Should have the same length as `predicted_depths` if provided. If `None`, FoV scaling is skipped. target_sizes (`Optional[Union[TensorType, List[tuple[int, int]], None]]`, *optional*, defaults to `None`): - Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + Target sizes to resize the depth predictions. 
Can be a tensor of shape `(batch_size, 2)` or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing is performed. diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3812f678b43fb9..5b521cfda9bd3e 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -15,16 +15,16 @@ """PyTorch DepthPro model.""" import math +from dataclasses import dataclass from typing import List, Optional, Set, Tuple, Union import torch from torch import nn -from dataclasses import dataclass from ...activations import ACT2FN -from ...modeling_outputs import ( - BaseModelOutput, DepthEstimatorOutput -) +from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -32,8 +32,6 @@ replace_return_docstrings, torch_int, ) -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from .configuration_depth_pro import DepthProConfig From 267e50fbe2288de71428776adebaea51b902751c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:46:50 +0500 Subject: [PATCH 33/72] formatting with style --- src/transformers/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3d0b85e3a1b424..0e6c48762a853c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5262,6 +5262,7 @@ XLMProphetNetConfig, ) from .models.depth_anything import DepthAnythingConfig + from .models.depth_pro import DepthProConfig from .models.detr import DetrConfig from .models.dinat import DinatConfig from .models.dinov2 import Dinov2Config @@ -5281,7 +5282,6 @@ DPRReaderTokenizer, ) from .models.dpt import DPTConfig - from .models.depth_pro import DepthProConfig from .models.efficientnet import ( EfficientNetConfig, ) @@ -6107,10 +6107,10 @@ from .models.deprecated.efficientformer import EfficientFormerImageProcessor from .models.deprecated.tvlt import TvltImageProcessor from .models.deprecated.vit_hybrid import ViTHybridImageProcessor + from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor - from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.efficientnet import EfficientNetImageProcessor from .models.flava import ( FlavaFeatureExtractor, @@ -6872,6 +6872,11 @@ DepthAnythingForDepthEstimation, DepthAnythingPreTrainedModel, ) + from .models.depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) from .models.detr import ( DetrForObjectDetection, DetrForSegmentation, @@ -6918,11 +6923,6 @@ DPTModel, DPTPreTrainedModel, ) - from .models.depth_pro import ( - DepthProForDepthEstimation, - DepthProModel, - DepthProPreTrainedModel, - ) from .models.efficientnet import ( EfficientNetForImageClassification, EfficientNetModel, From a1ec99743563ae054ae159a7d83dc76e9c09a4ab Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 28 Nov 
2024 00:48:06 +0500 Subject: [PATCH 34/72] fix copied classes --- .../depth_pro/configuration_depth_pro.py | 48 ++-- .../convert_depth_pro_weights_to_hf.py | 44 ++-- .../depth_pro/image_processing_depth_pro.py | 9 +- .../image_processing_depth_pro_fast.py | 9 +- .../models/depth_pro/modeling_depth_pro.py | 225 ++++++++++-------- 5 files changed, 174 insertions(+), 161 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 8bab8227be7ec7..d938f0a721f1ae 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -78,22 +78,22 @@ class DepthProConfig(PretrainedConfig): Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. - intermediate_hook_ids - Indices of the intermediate hidden states from patch_encoder to use for fusion. - intermediate_feature_dims - Hidden state during upsampling for each intermediate hidden states in intermediate_hook_ids. - scaled_images_ratios - Use images of these ratios for patch_encoder. - scaled_images_overlap_ratios - Overlap ratio between patches for each scaled image in scaled_image_ratios. - scaled_images_feature_dims - Hidden state during upsampling for each scaled image in scaled_images_ratios. - use_batch_norm_in_fusion - Whether to use batch normalization in the residual units of the fusion blocks. - use_fov_model - Whether to use `DepthProFOVModel` to generate Field of View. - num_fov_head_layers - No of convolution layers in head of `DepthProFOVModel`. + intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`): + Indices of the intermediate hidden states from the patch encoder to use for fusion. + intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`): + Hidden state dimensions during upsampling for each intermediate hidden state in `intermediate_hook_ids`. + scaled_images_ratios (`List[float]`, *optional*, defaults to `[0.25, 0.5, 1]`): + Ratios of scaled images to be used by the patch encoder. + scaled_images_overlap_ratios (`List[float]`, *optional*, defaults to `[0.0, 0.5, 0.25]`): + Overlap ratios between patches for each scaled image in `scaled_images_ratios`. + scaled_images_feature_dims (`List[int]`, *optional*, defaults to `[1024, 1024, 512]`): + Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_fov_model (`bool`, *optional*, defaults to `True`): + Whether to use `DepthProFOVModel` to generate the field of view. + num_fov_head_layers (`int`, *optional*, defaults to `2`): + Number of convolution layers in the head of `DepthProFOVModel`. 
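These multi-scale arguments determine how many feature-fusion stages the decoder builds: one per entry in `intermediate_hook_ids` plus one per entry in `scaled_images_ratios`, as the docstring formula in patch 37 later spells out. A short sketch of instantiating the config with the defaults documented above (assumes the `DepthProConfig` class added by this PR is importable):

```python
from transformers import DepthProConfig

config = DepthProConfig(
    intermediate_hook_ids=[11, 5],
    intermediate_feature_dims=[256, 256],
    scaled_images_ratios=[0.25, 0.5, 1],
    scaled_images_overlap_ratios=[0.0, 0.5, 0.25],
    scaled_images_feature_dims=[1024, 1024, 512],
)

# one fusion block per intermediate hook plus one per scaled image
n_fusion_blocks = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios)
print(n_fusion_blocks)  # 5 with these defaults
```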
Example: @@ -134,12 +134,13 @@ def __init__( use_swiglu_ffn=False, apply_layernorm=True, reshape_hidden_states=True, - intermediate_hook_ids = [11, 5], - intermediate_feature_dims = [256, 256], - scaled_images_ratios = [0.25, 0.5, 1], - scaled_images_overlap_ratios = [0.0, 0.5, 0.25], - scaled_images_feature_dims = [1024, 1024, 512], - use_batch_norm_in_fusion=False, + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, use_fov_model=True, num_fov_head_layers=2, **kwargs, @@ -166,7 +167,8 @@ def __init__( self.use_swiglu_ffn = use_swiglu_ffn self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.use_batch_norm_in_fusion = use_batch_norm_in_fusion + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + self.use_bias_in_fusion_residual = use_bias_in_fusion_residual self.use_fov_model = use_fov_model self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 66dfff12065a70..377595b746aca5 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -93,6 +93,7 @@ } # fmt: on + def convert_old_keys_to_new_keys(state_dict_keys: dict = None): output_dict = {} if state_dict_keys is not None: @@ -106,6 +107,7 @@ def convert_old_keys_to_new_keys(state_dict_keys: dict = None): output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) return output_dict + def get_qkv_state_dict(key, parameter): """ new key which looks like this @@ -117,21 +119,20 @@ def get_qkv_state_dict(key, parameter): xxxx.v.xxxx (m//3, n) """ qkv_state_dict = {} - placeholder = re.search(r'(\(.*?\))', key).group(1) # finds "(query|key|value)" - replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] + placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)" + replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] replacements_vals = torch.split( - parameter, - split_size_or_sections=parameter.size(0)//len(replacements_keys), - dim=0 + parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0 ) for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val return qkv_state_dict + def write_model( hf_repo_id: str, output_dir: str, - safe_serialization: bool=True, + safe_serialization: bool = True, ): os.makedirs(output_dir, exist_ok=True) @@ -162,11 +163,11 @@ def write_model( use_swiglu_ffn=False, apply_layernorm=True, reshape_hidden_states=True, - intermediate_hook_ids = [11, 5], - intermediate_feature_dims = [256, 256], - scaled_images_ratios = [0.25, 0.5, 1], - scaled_images_overlap_ratios = [0.0, 0.5, 0.25], - scaled_images_feature_dims = [1024, 1024, 512], + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], use_batch_norm_in_fusion=False, use_fov_model=True, 
num_fov_head_layers=2, @@ -215,18 +216,19 @@ def write_model( DepthProForDepthEstimation.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto") print("Model reloaded successfully.") + def write_image_processor(output_dir: str): image_processor = DepthProImageProcessorFast( - do_resize = True, - size = {"height": 1536, "width": 1536}, - resample = PILImageResampling.BILINEAR, - antialias = False, - do_rescale = True, - rescale_factor = 1 / 255, - do_normalize = True, - image_mean = 0.5, - image_std = 0.5, - return_tensors = "pt", + do_resize=True, + size={"height": 1536, "width": 1536}, + resample=PILImageResampling.BILINEAR, + antialias=False, + do_rescale=True, + rescale_factor=1 / 255, + do_normalize=True, + image_mean=0.5, + image_std=0.5, + return_tensors="pt", ) image_processor.save_pretrained(output_dir) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 6c9c7f94e2265c..15a33f804d145a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -371,18 +371,13 @@ def post_process_depth_estimation( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - outputs = { - "predicted_depth": [], - "fov": [] if fovs is not None else None - } + outputs = {"predicted_depth": [], "fov": [] if fovs is not None else None} fovs = [None] * len(predicted_depths) if fovs is None else fovs target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - if target_size is not None: - # scale image w.r.t fov if fov is not None: width = target_size[1] @@ -395,7 +390,7 @@ def post_process_depth_estimation( predicted_depth.unsqueeze(0).unsqueeze(1), size=target_size, resample=self.resample, - antialias=self.antialias + antialias=self.antialias, ).squeeze() # inverse the depth diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 46b502d7d26f2c..374d5c25cafc9e 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -161,7 +161,7 @@ def _build_transforms( Resize( (size["height"], size["width"]), interpolation=pil_torch_interpolation_mapping[resample], - antialias=antialias + antialias=antialias, ) ) @@ -351,18 +351,13 @@ def post_process_depth_estimation( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - outputs = { - "predicted_depth": [], - "fov": [] if fovs is not None else None - } + outputs = {"predicted_depth": [], "fov": [] if fovs is not None else None} fovs = [None] * len(predicted_depths) if fovs is None else fovs target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - if target_size is not None: - # scale image w.r.t fov if fov is not None: width = target_size[1] diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 5b521cfda9bd3e..77983933a19add 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -40,17 +40,11 @@ # 
General docstring _CONFIG_FOR_DOC = "DepthProConfig" -# Base docstring -_CHECKPOINT_FOR_DOC = "geetu040/DepthPro" -_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024] - -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT class DepthProViTPatchEmbeddings(nn.Module): """ - This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial - `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a - Transformer. + Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings + with addition of config parameter patch_embeddings_size """ def __init__(self, config): @@ -60,6 +54,7 @@ def __init__(self, config): self.in_channels = config.num_channels self.out_channels = config.hidden_size self.patch_embeddings_size = config.patch_embeddings_size + self.num_channels = config.num_channels self.projection = nn.Conv2d( self.in_channels, @@ -68,9 +63,10 @@ def __init__(self, config): stride=(self.patch_embeddings_size, self.patch_embeddings_size), ) + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings.forward def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.config.num_channels: + if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." @@ -79,11 +75,10 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.dinov2.modeling_dinov2.DepthProViTEmbeddings -# with DepthProViT->DepthProViT and antialias=True in interpolation class DepthProViTEmbeddings(nn.Module): """ - Construct the CLS token, position and patch embeddings. 
+ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Embeddings + except antialias=True in interpolation and removal of mask_token """ def __init__(self, config: DepthProConfig) -> None: @@ -131,7 +126,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: size=(new_height, new_width), mode="bicubic", align_corners=False, - antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProPatchEmbeddings + antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProViTPatchEmbeddings ).to(dtype=target_dtype) patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) @@ -155,7 +150,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthPro class DepthProViTSelfAttention(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -216,7 +211,7 @@ def forward( return outputs -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SelfAttention with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): def __init__(self, config: DepthProConfig) -> None: super().__init__(config) @@ -226,8 +221,9 @@ def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + "DepthProViTModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( @@ -257,7 +253,7 @@ def forward( return context_layer, None -# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTSelfOutput(nn.Module): """ The residual connection is defined in DepthProViTLayer instead of here (as is the case with other models), due to the @@ -276,7 +272,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTAttention(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -316,14 +312,14 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTSdpaAttention(DepthProViTAttention): def __init__(self, config: DepthProConfig) -> None: super().__init__(config) self.attention = DepthProViTSdpaSelfAttention(config) -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaAttention with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2LayerScale with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTLayerScale(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -369,7 +365,7 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthPro class DepthProViTMLP(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -389,7 +385,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthPro class DepthProViTSwiGLUFFN(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -413,7 +409,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: } -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing class DepthProViTLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" @@ -465,7 +461,7 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -569,14 +565,14 @@ def forward( class DepthProUpsampleBlock(nn.Module): def __init__( - self, - input_dims, - intermediate_dims, - output_dims, - n_upsample_layers, - use_proj=True, - bias=False, - ) -> None: + self, + input_dims, + intermediate_dims, + output_dims, + n_upsample_layers, + use_proj=True, + bias=False, + ) -> None: super().__init__() # create first 
projection block @@ -620,6 +616,7 @@ def interpolate(pixel_values, scale_factor): align_corners=False, ) + def patch(pixel_values, patch_size, overlap_ratio): """Creates Patches from Batch.""" B, C, W, H = pixel_values.shape @@ -631,9 +628,7 @@ def patch(pixel_values, patch_size, overlap_ratio): stride = int(patch_size * (1 - overlap_ratio)) # (B, C, W, H) - patches = torch.nn.functional.unfold( - pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) - ) + patches = torch.nn.functional.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride)) # patches.shape (B, patch_size**2 * C, num_patches) patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, patch_size**2 * C) @@ -642,11 +637,12 @@ def patch(pixel_values, patch_size, overlap_ratio): return patches + def reshape_feature(hidden_states, width, height): """Discard class token and reshape 1D feature map to a 2D grid.""" B, _, C = hidden_states.shape # (B, WH+1, C) - hidden_states = hidden_states[:, 1:, :] # remove class token + hidden_states = hidden_states[:, 1:, :] # remove class token # (B, WH, C) hidden_states = hidden_states.reshape(B, width, height, C) # (B, W, H, C) @@ -654,6 +650,7 @@ def reshape_feature(hidden_states, width, height): # (B, C, W, H) return hidden_states + def merge(patches, batch_size, merge_out_size): """Recreates Batch from Patches.""" num_patches, num_channels, out_size, out_size = patches.shape @@ -668,7 +665,7 @@ def merge(patches, batch_size, merge_out_size): merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) """ - padding = ( box_size * out_size - merge_out_size ) // ( 2 * box_size - 2 ) + padding = (box_size * out_size - merge_out_size) // (2 * box_size - 2) i = 0 boxes = [] @@ -685,10 +682,10 @@ def merge(patches, batch_size, merge_out_size): box = box[..., :, padding:] if h != box_size - 1: # remove pad from height if box is not at bottom border - box = box[..., :box.shape[-2]-padding, :] + box = box[..., : box.shape[-2] - padding, :] if w != box_size - 1: # remove pad from width if box is not at right border - box = box[..., :, :box.shape[-1]-padding] + box = box[..., :, : box.shape[-1] - padding] boxes_in_row.append(box) i += 1 @@ -717,13 +714,12 @@ def __init__(self, config: DepthProConfig) -> None: self.n_scaled_images = len(self.scaled_images_ratios) self.n_intermediate_hooks = len(self.intermediate_hook_ids) self.out_size = config.patch_size // config.patch_embeddings_size - self.seq_len = self.out_size ** 2 # each patch is flattened + self.seq_len = self.out_size**2 # each patch is flattened # config.scaled_images_ratios is sorted if config.scaled_images_ratios != sorted(config.scaled_images_ratios): raise ValueError( - f"Values in scaled_images_ratios={config.scaled_images_ratios} " - "should be sorted from low to high" + f"Values in scaled_images_ratios={config.scaled_images_ratios} " "should be sorted from low to high" ) # lowest image resolution is greator than the patch_size @@ -767,7 +763,7 @@ def __init__(self, config: DepthProConfig) -> None: input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=2+i, + n_upsample_layers=2 + i, ) self.upsample_intermediate.append(upsample_block) @@ -783,7 +779,7 @@ def __init__(self, config: DepthProConfig) -> None: # for STEP 7: fuse low_res and image features self.fuse_image_with_low_res = nn.Conv2d( - 
in_channels=config.scaled_images_feature_dims[0]*2, + in_channels=config.scaled_images_feature_dims[0] * 2, out_channels=config.scaled_images_feature_dims[0], kernel_size=1, stride=1, @@ -838,7 +834,7 @@ def forward( overlap_ratio=self.scaled_images_overlap_ratios[i], ) scaled_images_num_patches = [len(i) for i in scaled_images] - patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first # (sum(scaled_images_num_patches), config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -847,16 +843,15 @@ def forward( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=True, # required for intermediate features + output_hidden_states=True, # required for intermediate features return_dict=True, ) scaled_images_last_hidden_state = torch.split_with_sizes( - patch_encodings.last_hidden_state, - scaled_images_num_patches[::-1] - )[::-1] # -1 as patch encoder expects high res patches first + patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] + )[::-1] # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -874,12 +869,12 @@ def forward( # b. reshape back to image like features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) + ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) # c. merge patches back together features = merge( - features, batch_size=B, merge_out_size=self.out_size*2**i - ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) + features, batch_size=B, merge_out_size=self.out_size * 2**i + ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) # d. upsample features = self.upsample_scaled_images[i](features) @@ -891,11 +886,14 @@ def forward( intermediate_features = [] for i in range(self.n_intermediate_hooks): - # a. extract hidden_state - layer_id = self.intermediate_hook_ids[i] + 1 # +1 to correct index position as hidden_states contain embedding output as well + layer_id = ( + self.intermediate_hook_ids[i] + 1 + ) # +1 to correct index position as hidden_states contain embedding output as well hidden_state = patch_encodings.hidden_states[layer_id] - hidden_state = hidden_state[:scaled_images_num_patches[-1]] # num_patches to be of same length as highest resolution + hidden_state = hidden_state[ + : scaled_images_num_patches[-1] + ] # num_patches to be of same length as highest resolution # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like @@ -903,12 +901,14 @@ def forward( hidden_state, self.out_size, self.out_size, - ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together features = merge( - features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (self.n_scaled_images - 1), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample features = self.upsample_intermediate[i](features) @@ -919,20 +919,26 @@ def forward( # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + hidden_state = ( + image_encodings.last_hidden_state + ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together image_features = merge( - image_features, batch_size=B, merge_out_size=self.out_size*2**(0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + image_features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - image_features = self.upsample_image(image_features) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) + image_features = self.upsample_image( + image_features + ) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) # STEP 7: apply fusion (global_features = image_features + scaled_images_features[0]) # fuses image_features with lowest resolution features as they are of same size @@ -1089,37 +1095,49 @@ def forward( return encodings -# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro -class DepthProResidualLayer(nn.Module): +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro +class DepthProPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthProConfig]`): + Model configuration class defining the model architecture. 
+ """ + def __init__(self, config): super().__init__() - self.use_batch_norm = config.use_batch_norm_in_fusion - self.hidden_size = config.fusion_hidden_size + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) self.activation1 = nn.ReLU() self.convolution1 = nn.Conv2d( - self.hidden_size, - self.hidden_size, + config.fusion_hidden_size, + config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=(not self.use_batch_norm), + bias=use_bias_in_fusion_residual, ) self.activation2 = nn.ReLU() self.convolution2 = nn.Conv2d( - self.hidden_size, - self.hidden_size, + config.fusion_hidden_size, + config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=(not self.use_batch_norm), + bias=use_bias_in_fusion_residual, ) if self.use_batch_norm: - self.batch_norm1 = nn.BatchNorm2d(self.hidden_size) - self.batch_norm2 = nn.BatchNorm2d(self.hidden_size) + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size) def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: residual = hidden_state @@ -1139,15 +1157,16 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state + residual -# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# Taken from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# except it uses deconv, skip_add and avoids interpolation (it always receives consitent inputs) class DepthProFeatureFusionLayer(nn.Module): - def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: + def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: super().__init__() self.config = config self.use_deconv = use_deconv - self.residual_layer1 = DepthProResidualLayer(config) - self.residual_layer2 = DepthProResidualLayer(config) + self.residual_layer1 = DepthProPreActResidualLayer(config) + self.residual_layer2 = DepthProPreActResidualLayer(config) if self.use_deconv: self.deconv = nn.ConvTranspose2d( @@ -1174,13 +1193,14 @@ def forward(self, hidden_state, residual=None): return hidden_state -# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro with extra layer parameters +# Take from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro +# with extra layer parameters, deconv and reversed layers class DepthProFeatureFusionStage(nn.Module): def __init__(self, config, num_layers): super().__init__() self.num_layers = num_layers self.layers = nn.ModuleList() - for _ in range(self.num_layers-1): + for _ in range(self.num_layers - 1): self.layers.append(DepthProFeatureFusionLayer(config)) # final layer doesnot require deconvolution self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) @@ -1214,7 +1234,7 @@ def __init__(self, config: DepthProConfig) -> None: self.encoder_neck = nn.Linear(self.hidden_size, self.fusion_hidden_size // 2) self.global_neck = nn.Sequential( nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True) + nn.ReLU(True), ) if config.fusion_hidden_size // 2**config.num_fov_head_layers == 0: @@ -1227,19 +1247,21 @@ def __init__(self, config: DepthProConfig) -> None: self.head = nn.Sequential() for i in range(config.num_fov_head_layers): self.head.append( - 
nn.Conv2d(self.fusion_hidden_size // 2**(i+1), self.fusion_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + nn.Conv2d( + self.fusion_hidden_size // 2 ** (i + 1), + self.fusion_hidden_size // 2 ** (i + 2), + kernel_size=3, + stride=2, + padding=1, + ) ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.fusion_hidden_size // 2**(config.num_fov_head_layers+1) + final_in_channels = self.fusion_hidden_size // 2 ** (config.num_fov_head_layers + 1) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( - in_channels=final_in_channels, - out_channels=1, - kernel_size=final_kernal_size, - stride=1, - padding=0 + in_channels=final_in_channels, out_channels=1, kernel_size=final_kernal_size, stride=1, padding=0 ) ) @@ -1263,7 +1285,7 @@ def forward( # follow the steps same as with image features in DepthProEncoder pixel_values = interpolate( pixel_values, - scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image + scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image ) patches = patch( pixel_values, @@ -1279,11 +1301,7 @@ def forward( ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.encoder_neck(last_hidden_state) - last_hidden_state = reshape_feature( - last_hidden_state, - width=self.out_size, - height=self.out_size - ) + last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) last_hidden_state = merge( last_hidden_state, batch_size=B, @@ -1321,12 +1339,11 @@ def __init__(self, config): features = config.fusion_hidden_size self.head = nn.Sequential( - nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( - in_channels=features//2, out_channels=features//2, - kernel_size=2, stride=2, padding=0, bias=True + in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True ), - nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(True), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(), @@ -1347,6 +1364,7 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. 
""" + fov: Optional[torch.FloatTensor] = None @@ -1369,7 +1387,7 @@ def __init__(self, config, use_fov_model=None): combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims self.projections = nn.ModuleList() for i, in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims)-1 and in_channels == config.fusion_hidden_size: + if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: # projection for last layer can be ignored if input and output channels already match self.projections.append(nn.Identity()) else: @@ -1397,7 +1415,6 @@ def __init__(self, config, use_fov_model=None): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1454,7 +1471,9 @@ def forward( ) fov = fov_encodings.last_hidden_state attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + hidden_states = ( + depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + ) else: fov = None attentions = depth_pro_outputs.attentions From 3c656f24a5e33fed84663f2c0d45053b2b3c4e91 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 28 Nov 2024 01:29:54 +0500 Subject: [PATCH 35/72] add examples; update weight convert script --- .../convert_depth_pro_weights_to_hf.py | 4 +- .../models/depth_pro/modeling_depth_pro.py | 58 ++++++++++++++++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 377595b746aca5..cd06a99c5fb2b4 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -168,7 +168,8 @@ def write_model( scaled_images_ratios=[0.25, 0.5, 1], scaled_images_overlap_ratios=[0.0, 0.5, 0.25], scaled_images_feature_dims=[1024, 1024, 512], - use_batch_norm_in_fusion=False, + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, use_fov_model=True, num_fov_head_layers=2, ) @@ -228,7 +229,6 @@ def write_image_processor(output_dir: str): do_normalize=True, image_mean=0.5, image_std=0.5, - return_tensors="pt", ) image_processor.save_pretrained(output_dir) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 77983933a19add..255174de09934b 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1068,8 +1068,34 @@ def forward( Returns: Examples: - TODO + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, DepthProModel + >>> + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> + >>> checkpoint = "geetu040/DepthPro" + >>> processor = AutoProcessor.from_pretrained(checkpoint) + >>> model = DepthProModel.from_pretrained(checkpoint) + >>> + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + >>> + >>> with torch.no_grad(): + ... 
output = model(**inputs) + ... + >>> for state in output.last_hidden_state: + ... print(state.shape) + ... + torch.Size([1, 1024, 48, 48]) + torch.Size([1, 1024, 96, 96]) + torch.Size([1, 512, 192, 192]) + torch.Size([1, 256, 384, 384]) + torch.Size([1, 256, 768, 768]) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1433,8 +1459,36 @@ def forward( Returns: Examples: - TODO + ```python + >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation + >>> import torch + >>> from PIL import Image + >>> import requests + >>> + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> + >>> checkpoint = "geetu040/DepthPro" + >>> processor = AutoImageProcessor.from_pretrained(checkpoint) + >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) + >>> + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + >>> + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... + >>> # interpolate to original size + >>> post_processed_output = processor.post_process_depth_estimation( + ... outputs.predicted_depth, outputs.fov, target_sizes=[(image.height, image.width)], + ... ) + >>> + >>> # visualize the prediction + >>> predicted_depth = post_processed_output["predicted_depth"][0] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) ```""" loss = None if labels is not None: From f6f6d3d130b97519b8f9bf0ae9413301f655ecd9 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 10:08:56 +0500 Subject: [PATCH 36/72] fix using check_table.py and isort --- docs/source/en/index.md | 1 + src/transformers/__init__.py | 18 ++++++++-------- .../models/auto/configuration_auto.py | 4 ++-- .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 6 +++--- .../models/gemma/configuration_gemma.py | 1 - src/transformers/utils/dummy_pt_objects.py | 21 +++++++++++++++++++ .../utils/dummy_vision_objects.py | 14 +++++++++++++ 8 files changed, 51 insertions(+), 16 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index aaff45ab65dfb6..d316e89ce6f45d 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -117,6 +117,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [DeiT](model_doc/deit) | ✅ | ✅ | ❌ | | [DePlot](model_doc/deplot) | ✅ | ❌ | ❌ | | [Depth Anything](model_doc/depth_anything) | ✅ | ❌ | ❌ | +| [DepthPro](model_doc/depth_pro) | ✅ | ❌ | ❌ | | [DETA](model_doc/deta) | ✅ | ❌ | ❌ | | [DETR](model_doc/detr) | ✅ | ❌ | ❌ | | [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ | diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0e6c48762a853c..d4ac4b5fd866fa 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -387,6 +387,7 @@ "models.deprecated.vit_hybrid": ["ViTHybridConfig"], "models.deprecated.xlm_prophetnet": ["XLMProphetNetConfig"], "models.depth_anything": ["DepthAnythingConfig"], + "models.depth_pro": ["DepthProConfig"], "models.detr": ["DetrConfig"], "models.dialogpt": [], "models.dinat": ["DinatConfig"], @@ -408,7 +409,6 @@ "DPRReaderTokenizer", ], "models.dpt": ["DPTConfig"], - "models.depth_pro": ["DepthProConfig"], "models.efficientnet": ["EfficientNetConfig"], "models.electra": [ "ElectraConfig", @@ -1193,10 +1193,10 @@ _import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.deprecated.tvlt"].append("TvltImageProcessor") _import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"]) + _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) - _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) @@ -2078,6 +2078,13 @@ "DepthAnythingPreTrainedModel", ] ) + _import_structure["models.depth_pro"].extend( + [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + ) _import_structure["models.detr"].extend( [ "DetrForObjectDetection", @@ -2138,13 +2145,6 @@ "DPTPreTrainedModel", ] ) - _import_structure["models.depth_pro"].extend( - [ - "DepthProForDepthEstimation", - "DepthProModel", - "DepthProPreTrainedModel", - ] - ) _import_structure["models.efficientnet"].extend( [ "EfficientNetForImageClassification", diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d8860d38f85046..a02af514b65aa1 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -85,6 +85,7 @@ ("deformable_detr", "DeformableDetrConfig"), ("deit", "DeiTConfig"), ("depth_anything", "DepthAnythingConfig"), + ("depth_pro", "DepthProConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), ("dinat", "DinatConfig"), @@ -93,7 +94,6 @@ ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), - ("depth_pro", "DepthProConfig"), ("efficientformer", "EfficientFormerConfig"), ("efficientnet", "EfficientNetConfig"), ("electra", "ElectraConfig"), @@ -385,6 +385,7 @@ ("deplot", "DePlot"), ("depth_anything", "Depth Anything"), ("depth_anything_v2", "Depth Anything V2"), + ("depth_pro", "DepthPro"), ("deta", "DETA"), ("detr", 
"DETR"), ("dialogpt", "DialoGPT"), @@ -395,7 +396,6 @@ ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), - ("depth_pro", "DepthPro"), ("efficientformer", "EfficientFormer"), ("efficientnet", "EfficientNet"), ("electra", "ELECTRA"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index e7b53f30a7a064..3887f29415b052 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -71,13 +71,13 @@ ("deformable_detr", ("DeformableDetrImageProcessor",)), ("deit", ("DeiTImageProcessor",)), ("depth_anything", ("DPTImageProcessor",)), + ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("deta", ("DetaImageProcessor",)), ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")), ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("dinov2", ("BitImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)), ("dpt", ("DPTImageProcessor",)), - ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor",)), ("efficientnet", ("EfficientNetImageProcessor",)), ("flava", ("FlavaImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4cc15ca4ca51c2..b8bcd0cbcb00a9 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -84,6 +84,7 @@ ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), ("dinat", "DinatModel"), @@ -92,7 +93,6 @@ ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), - ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("electra", "ElectraModel"), @@ -567,12 +567,12 @@ ("data2vec-vision", "Data2VecVisionModel"), ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dpt", "DPTModel"), - ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), @@ -867,8 +867,8 @@ [ # Model for depth estimation mapping ("depth_anything", "DepthAnythingForDepthEstimation"), - ("dpt", "DPTForDepthEstimation"), ("depth_pro", "DepthProForDepthEstimation"), + ("dpt", "DPTForDepthEstimation"), ("glpn", "GLPNForDepthEstimation"), ("zoedepth", "ZoeDepthForDepthEstimation"), ] diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index e170803cccab70..346f386ba698f2 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,7 +20,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 36e1ff2cfe65c4..dc32f6d653d635 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3457,6 +3457,27 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class DepthProForDepthEstimation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DepthProModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DepthProPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class DetrForObjectDetection(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 19cf02a4e85826..1ceb9e227bb2d9 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -177,6 +177,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class DepthProImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + +class DepthProImageProcessorFast(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class DetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From b4575d026de8a8ca69650c76ab3b21f22e860a48 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 10:45:19 +0500 Subject: [PATCH 37/72] fix config docstring --- .../models/depth_pro/configuration_depth_pro.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d938f0a721f1ae..d48d68b832b472 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -38,7 +38,7 @@ class DepthProConfig(PretrainedConfig): The number of channels before fusion. num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): + num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. mlp_ratio (`int`, *optional*, defaults to 4): Ratio of the hidden size of the MLPs relative to the `hidden_size`. @@ -58,7 +58,7 @@ class DepthProConfig(PretrainedConfig): To generate depth of same size as image, image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - patch_size (`int`, *optional*, defaults to 14): + patch_size (`int`, *optional*, defaults to 384): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. @@ -90,9 +90,11 @@ class DepthProConfig(PretrainedConfig): Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. 
use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): + Whether to use bias in the pre-activate residual units of the fusion blocks. use_fov_model (`bool`, *optional*, defaults to `True`): Whether to use `DepthProFOVModel` to generate the field of view. - num_fov_head_layers (`int`, *optional*, defaults to `2`): + num_fov_head_layers (`int`, *optional*, defaults to 2): Number of convolution layers in the head of `DepthProFOVModel`. Example: From c8d8a9e0ca3750cc062fe9ad3b90fdbe5a893f0b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 11:26:12 +0500 Subject: [PATCH 38/72] add depth pro to sdpa docs --- docs/source/en/perf_infer_gpu_one.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 67bd31fdaeede5..4f1ccc9c427c37 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -227,6 +227,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) +* [DepthPro](https://huggingface.co/docs/transformers/model_doc/depth_pro#transformers.DepthProModel) * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) From 77873de8a34447d64d16e1a5def4ba8fb7109bb5 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Fri, 29 Nov 2024 15:30:42 +0500 Subject: [PATCH 39/72] undo unintentional changes in configuration_gemma.py --- src/transformers/models/gemma/configuration_gemma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 346f386ba698f2..e170803cccab70 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,6 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
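With the `depth_pro` auto-class mappings registered in patch 36 and `DepthProModel` added to the SDPA list in patch 38 above, loading can go through the Auto classes and pick the attention backend explicitly. A sketch, assuming the PR is merged and the `geetu040/DepthPro` checkpoint used in the earlier doctest examples is available:

```python
import torch
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

# both calls resolve through the depth_pro entries added in patch 36
processor = AutoImageProcessor.from_pretrained("geetu040/DepthPro")
model = AutoModelForDepthEstimation.from_pretrained(
    "geetu040/DepthPro",
    attn_implementation="sdpa",  # "eager" selects the manual attention path mentioned in the warning earlier in this series
    torch_dtype=torch.float16,
)
```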
+ from ...configuration_utils import PretrainedConfig From 5f2378d112193317902a733d13b21fc081fc8b56 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 30 Nov 2024 23:51:55 +0500 Subject: [PATCH 40/72] minor fixes --- src/transformers/models/__init__.py | 1 + .../depth_pro/image_processing_depth_pro.py | 24 +++++++++++-------- .../models/depth_pro/modeling_depth_pro.py | 7 +----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 9155f629e63f91..fc26362dd64dc4 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -69,6 +69,7 @@ deit, deprecated, depth_anything, + depth_pro, detr, dialogpt, dinat, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 15a33f804d145a..746f246fcd73a9 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -113,7 +113,7 @@ def __init__( def resize( self, - images: List[np.ndarray], + image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, antialias: bool = False, @@ -125,8 +125,8 @@ def resize( Resize an image to `(size["height"], size["width"])`. Args: - images (`List[np.ndarray]`): - Images to resize. + image (`np.ndarray`): + Image to resize. size (`Dict[str, int]`): Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): @@ -157,16 +157,13 @@ def resize( raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - images = np.stack(images) - images = torch.from_numpy(images) - return torch.nn.functional.interpolate( # input should be (B, C, H, W) - input=images, + input=torch.from_numpy(image).unsqueeze(0), size=output_size, mode=pil_torch_interpolation_mapping[resample].value, antialias=antialias, - ) + ).squeeze(0).numpy() def _validate_input_arguments( self, @@ -321,8 +318,15 @@ def preprocess( # depth-pro scales the image before resizing it # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: - images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) - images = images.numpy() + images = [ + self.resize( + image=image, + size=size, + resample=resample, + antialias=antialias, + ) + for image in images + ] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 255174de09934b..16601f9c7c8621 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -660,7 +660,7 @@ def merge(patches, batch_size, merge_out_size): # patches are not created when scaled image size is equal to patch size return patches - box_size = int(math.sqrt(num_patches // batch_size)) + box_size = math.ceil(math.sqrt(num_patches // batch_size)) """ merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) @@ -806,11 +806,6 @@ def forward( B, C, H, W = pixel_values.shape - if not (H == W == self.config.image_size): - raise ValueError( - f"Height={H} and Width={W} doesnot match the specified image_size={self.config.image_size} in config." - ) - if not (C == self.config.num_channels): raise ValueError( f"Found {C} channels in image, expected number of channels is {self.config.num_channels} from config." From d51d0b198824370c47650ca6cc49f403e9c752cc Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 30 Nov 2024 23:57:26 +0500 Subject: [PATCH 41/72] test image processing --- .../test_image_processing_depth_pro.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 tests/models/depth_pro/test_image_processing_depth_pro.py diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py new file mode 100644 index 00000000000000..eea9ed01378db9 --- /dev/null +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
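The reworked `resize` above now processes one image at a time through `torch.nn.functional.interpolate`. The sketch below isolates that pattern with an explicit `"bilinear"` mode standing in for `pil_torch_interpolation_mapping[resample].value`; it is a simplification for illustration, not the processor's exact code path.

```python
import numpy as np
import torch

def resize_single_image(image: np.ndarray, height: int, width: int, antialias: bool = False) -> np.ndarray:
    # image is expected channels-first (C, H, W), matching the ChannelDimension.FIRST
    # layout the processor uses before calling torch interpolation
    tensor = torch.from_numpy(image).unsqueeze(0)  # (1, C, H, W)
    resized = torch.nn.functional.interpolate(
        input=tensor,
        size=(height, width),
        mode="bilinear",  # stand-in for pil_torch_interpolation_mapping[resample].value
        antialias=antialias,
    )
    return resized.squeeze(0).numpy()

image = np.random.rand(3, 96, 128).astype(np.float32)
print(resize_single_image(image, 18, 18).shape)  # (3, 18, 18)
```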
+ + +import unittest + +from transformers.file_utils import is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import DepthProImageProcessor, DepthProImageProcessorFast + + +class DepthProImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + super().__init__() + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DepthProImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DepthProImageProcessor if is_vision_available() else None + fast_image_processing_class = DepthProImageProcessorFast if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DepthProImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "resample")) + self.assertTrue(hasattr(image_processing, "antialias")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) From 082b05555df1b7b55335d6790582f47b0e6c4ca1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 02:01:42 +0500 Subject: [PATCH 42/72] fixes and tests --- docs/source/en/model_doc/depth_pro.md | 119 +++++++ 
.../depth_pro/configuration_depth_pro.py | 2 +- .../depth_pro/image_processing_depth_pro.py | 1 - .../models/depth_pro/modeling_depth_pro.py | 177 +++++---- tests/models/depth_pro/__init__.py | 0 .../depth_pro/test_modeling_depth_pro.py | 335 ++++++++++++++++++ 6 files changed, 558 insertions(+), 76 deletions(-) create mode 100644 docs/source/en/model_doc/depth_pro.md create mode 100644 tests/models/depth_pro/__init__.py create mode 100644 tests/models/depth_pro/test_modeling_depth_pro.py diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md new file mode 100644 index 00000000000000..6472cc506dae72 --- /dev/null +++ b/docs/source/en/model_doc/depth_pro.md @@ -0,0 +1,119 @@ + + +# DepthPro + +## Overview + +The DepthPro model was proposed in [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073) by Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, Vladlen Koltun. + +It leverages a multi-scale [Vision Transformer (ViT)](vit) optimized for dense predictions. It downsamples an image at several scales. At each scale, it is split into patches, which are processed by a ViT-based [Dinov2](dinov2) patch encoder, with weights shared across scales. Patches are merged into feature maps, upsampled, and fused via a [DPT](dpt) like decoder. + +The abstract from the paper is the following: + +*We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* + + + + DepthPro architecture. Taken from the original paper. + +This model was contributed by [geetu040](https://github.com/geetu040). The original code can be found [here](https://github.com/apple/ml-depth-pro). + + + +## Usage tips + +```python +from transformers import Dinov2Config, DepthProConfig, DepthProForDepthEstimation + +# initialize with a Transformer-based backbone such as DINOv2 +# in that case, we also specify `reshape_hidden_states=False` to get feature maps of shape (batch_size, num_channels, height, width) +backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False) + +config = DepthProConfig(backbone_config=backbone_config) +model = DepthProForDepthEstimation(config=config) +``` + +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. 
See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import ViTForImageClassification +model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-base-patch16-224` model, we saw the following speedups during inference. + +| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) | +|--------------|-------------------------------------------|-------------------------------------------|------------------------------| +| 1 | 7 | 6 | 1.17 | +| 2 | 8 | 6 | 1.33 | +| 4 | 8 | 6 | 1.33 | +| 8 | 8 | 6 | 1.33 | + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro. + +- Demo notebooks for [`DepthProForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DepthPro). + +- [Semantic segmentation task guide](../tasks/semantic_segmentation) +- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## DepthProConfig + +[[autodoc]] DepthProConfig + +## DepthProFeatureExtractor + +[[autodoc]] DepthProFeatureExtractor + - __call__ + - post_process_semantic_segmentation + +## DepthProImageProcessor + +[[autodoc]] DepthProImageProcessor + - preprocess + - post_process_semantic_segmentation + +## DepthProModel + +[[autodoc]] DepthProModel + - forward + +## DepthProForDepthEstimation + +[[autodoc]] DepthProForDepthEstimation + - forward + +## DepthProForSemanticSegmentation + +[[autodoc]] DepthProForSemanticSegmentation + - forward diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d48d68b832b472..beb3215d8ddf8d 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -56,7 +56,7 @@ class DepthProConfig(PretrainedConfig): image_size (`int`, *optional*, defaults to 1536): The size (resolution) of each image, To generate depth of same size as image, - image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size + image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) patch_size (`int`, *optional*, defaults to 384): The size (resolution) of each patch. 
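To make the corrected size constraint above concrete, here is a quick arithmetic check using the toy values that the model tests later in this series use (image_size=64, patch_size=8, patch_embeddings_size=4, two intermediate hooks and two scaled-image ratios); this only illustrates the formula and is not additional model code.

```python
# Check: image_size / 2**(n_fusion_blocks + 1) == patch_size / patch_embeddings_size
intermediate_hook_ids = [1, 0]
scaled_images_ratios = [0.5, 1.0]
n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios)  # 4

image_size, patch_size, patch_embeddings_size = 64, 8, 4
assert image_size / 2 ** (n_fusion_blocks + 1) == patch_size / patch_embeddings_size  # 2.0 == 2.0
```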
diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 746f246fcd73a9..65a29900c63744 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -264,7 +264,6 @@ def preprocess( image_std = image_std if image_std is not None else self.image_std size = size if size is not None else self.size - size_dict = get_size_dict(size) images = make_list_of_images(images) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 16601f9c7c8621..2e074588d4e301 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -31,6 +31,7 @@ logging, replace_return_docstrings, torch_int, + ModelOutput, ) from .configuration_depth_pro import DepthProConfig @@ -87,9 +88,9 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.seq_len = (config.patch_size // config.patch_embeddings_size) ** 2 - self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) - self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.zeros(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: @@ -698,6 +699,35 @@ def merge(patches, batch_size, merge_out_size): return boxes +@dataclass +class DepthProOutput(ModelOutput): + """ + Base class for DepthPro's outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + features (`List[torch.FloatTensor]`, *optional*: + Features from scaled images and hidden_states. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + last_hidden_state: torch.FloatTensor = None + features: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -794,7 +824,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: + ) -> Union[tuple, DepthProOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -848,8 +878,8 @@ def forward( image_encodings = self.image_encoder( pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, return_dict=True, ) @@ -941,21 +971,36 @@ def forward( scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) # STEP 8: return these features in order of increasing size as what fusion expects - last_hidden_state = [ + features = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) *scaled_images_features, # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) *intermediate_features, ] - hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None - attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None + # prepare last_hidden_state, hidden_states, attentions from patches to batches + + last_hidden_state = patch_encodings.last_hidden_state + hidden_states = patch_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions if output_attentions else None + + num_patches = sum(scaled_images_num_patches) + # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 + indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T + indexes = indexes.to(last_hidden_state.device) + + last_hidden_state = last_hidden_state[indexes].mean(1) + if hidden_states is not None: + hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) + if attentions is not None: + attentions = tuple([state[indexes].mean(1) for state in attentions]) if not return_dict: - return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) + return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) - return BaseModelOutput( + return DepthProOutput( last_hidden_state=last_hidden_state, + features=features, hidden_states=hidden_states, attentions=attentions, ) @@ -1034,11 +1079,7 @@ def __init__(self, config): self.post_init() def get_input_embeddings(self): - embeddings = { - "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, - "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, - } - return embeddings + return self.encoder.patch_encoder.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune): """ @@ -1058,7 +1099,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> 
Union[Tuple, BaseModelOutput]: + ) -> Union[Tuple, DepthProOutput]: r""" Returns: @@ -1215,7 +1256,7 @@ def forward(self, hidden_state, residual=None): # Take from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro -# with extra layer parameters, deconv and reversed layers +# with num_layers, deconv and reversed layers class DepthProFeatureFusionStage(nn.Module): def __init__(self, config, num_layers): super().__init__() @@ -1269,8 +1310,8 @@ def __init__(self, config: DepthProConfig) -> None: for i in range(config.num_fov_head_layers): self.head.append( nn.Conv2d( - self.fusion_hidden_size // 2 ** (i + 1), - self.fusion_hidden_size // 2 ** (i + 2), + math.ceil(self.fusion_hidden_size / 2 ** (i + 1)), + math.ceil(self.fusion_hidden_size / 2 ** (i + 2)), kernel_size=3, stride=2, padding=1, @@ -1278,7 +1319,7 @@ def __init__(self, config: DepthProConfig) -> None: ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.fusion_hidden_size // 2 ** (config.num_fov_head_layers + 1) + final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( @@ -1291,16 +1332,7 @@ def forward( pixel_values: torch.Tensor, global_features: torch.Tensor, head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + ) -> torch.Tensor: B, C, W, H = pixel_values.shape # follow the steps same as with image features in DepthProEncoder @@ -1316,11 +1348,11 @@ def forward( encoder_outputs = self.encoder( patches, head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, + return_dict=True, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.encoder_neck(last_hidden_state) last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) last_hidden_state = merge( @@ -1335,15 +1367,7 @@ def forward( fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(B) - if not return_dict: - head_outputs = (fov_output,) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutput( - last_hidden_state=fov_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + return fov_output class DepthProDepthEstimationHead(nn.Module): @@ -1377,16 +1401,36 @@ def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: @dataclass -class DepthProDepthEstimatorOutput(DepthEstimatorOutput): +class DepthProDepthEstimatorOutput(ModelOutput): """ - Base class for outputs of DepthProDepthEstimator. + Base class for DepthProForDepthEstimation's output. 
Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + fov (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. """ + loss: Optional[torch.FloatTensor] = None + predicted_depth: torch.FloatTensor = None fov: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @add_start_docstrings( @@ -1502,41 +1546,26 @@ def forward( output_hidden_states=output_hidden_states, return_dict=True, ) - last_hidden_state = depth_pro_outputs.last_hidden_state - last_hidden_state = [proj(state) for proj, state in zip(self.projections, last_hidden_state)] - fused_state = self.fusion_stage(last_hidden_state) - predicted_depth = self.head(fused_state) + features = depth_pro_outputs.features + features = [proj(feature) for proj, feature in zip(self.projections, features)] + fused_features = self.fusion_stage(features) + predicted_depth = self.head(fused_features) - if self.use_fov_model: + fov = self.fov_model( + pixel_values=pixel_values, # use lowest scaled image features for fov model - global_features = last_hidden_state[0].detach() - fov_encodings = self.fov_model( - pixel_values=pixel_values, - global_features=global_features, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - fov = fov_encodings.last_hidden_state - attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = ( - depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None - ) - else: - fov = None - attentions = depth_pro_outputs.attentions - hidden_states = depth_pro_outputs.hidden_states + global_features=features[0].detach(), + head_mask=head_mask, + ) if self.use_fov_model else None if not return_dict: - outputs = (predicted_depth, fov, hidden_states, attentions) - outputs = (i for i in outputs if i is not None) - return outputs + outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] + return tuple(v for v in outputs if v is not None) return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, fov=fov, - hidden_states=hidden_states, - attentions=attentions, + 
hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, ) diff --git a/tests/models/depth_pro/__init__.py b/tests/models/depth_pro/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py new file mode 100644 index 00000000000000..3d37965dcd1bd0 --- /dev/null +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -0,0 +1,335 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch DepthPro model.""" + +import unittest + +from transformers import DepthProConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DepthProForDepthEstimation, DepthProModel + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import DepthProImageProcessor + + +class DepthProModelTester: + def __init__( + self, + parent, + batch_size=8, + image_size=64, + patch_size=8, + patch_embeddings_size=4, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + fusion_hidden_size=16, + intermediate_hook_ids=[1, 0], + intermediate_feature_dims=[8, 8], + scaled_images_ratios=[0.5, 1.0], + scaled_images_overlap_ratios=[0.0, 0.2], + scaled_images_feature_dims=[12, 12], + num_hidden_layers=2, + num_attention_heads=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + use_fov_model=True, + num_labels=3, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.patch_embeddings_size = patch_embeddings_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.fusion_hidden_size = fusion_hidden_size + self.intermediate_hook_ids = intermediate_hook_ids + self.intermediate_feature_dims = intermediate_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.use_fov_model = use_fov_model + 
self.num_labels = num_labels + + self.num_patches = (patch_size // patch_embeddings_size) ** 2 + self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DepthProConfig( + image_size=self.image_size, + patch_size=self.patch_size, + patch_embeddings_size=self.patch_embeddings_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + fusion_hidden_size=self.fusion_hidden_size, + intermediate_hook_ids=self.intermediate_hook_ids, + intermediate_feature_dims=self.intermediate_feature_dims, + scaled_images_ratios=self.scaled_images_ratios, + scaled_images_overlap_ratios=self.scaled_images_overlap_ratios, + scaled_images_feature_dims=self.scaled_images_feature_dims, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + use_fov_model=self.use_fov_model, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DepthProModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DepthProForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DepthProModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DepthPro does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (DepthProModel, DepthProForDepthEstimation) if is_torch_available() else () + pipeline_model_mapping = ( + { + "depth-estimation": DepthProForDepthEstimation, + "image-feature-extraction": DepthProModel, + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DepthProModelTester(self) + self.config_tester = ConfigTester(self, config_class=DepthProConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DepthPro does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + # Skip the check for the backbone + backbone_params = [] + for name, module in model.named_modules(): + if module.__class__.__name__ == "DepthProViTHybridEmbeddings": + backbone_params = [f"{name}.{key}" 
for key in module.state_dict().keys()] + break + + for name, param in model.named_parameters(): + if param.requires_grad: + if name in backbone_params: + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @slow + def test_model_from_pretrained(self): + model_name = "Intel/depth_pro-large" + model = DepthProModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DepthProModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") + model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 384)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) + + def test_post_processing_depth_estimation(self): + image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") + model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large") + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] + expected_shape = torch.Size((384, 384)) + self.assertTrue(predicted_depth.shape == expected_shape) + + predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) + predicted_depth_l = predicted_depth_l[0]["predicted_depth"] + expected_shape = torch.Size((500, 500)) + self.assertTrue(predicted_depth_l.shape == expected_shape) + + output_enlarged = torch.nn.functional.interpolate( + predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False + ).squeeze() + self.assertTrue(output_enlarged.shape == expected_shape) + self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3)) From 16a39178307e3d2b484fb0df44e3ff05e0b67aff Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 02:20:22 +0500 Subject: [PATCH 43/72] more fixes --- docs/source/en/model_doc/depth_pro.md | 19 +++++++------------ .../depth_pro/configuration_depth_pro.py | 10 ---------- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 6472cc506dae72..7e4ac13f1d648f 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -91,17 +91,17 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DepthProConfig -## DepthProFeatureExtractor - -[[autodoc]] 
DepthProFeatureExtractor - - __call__ - - post_process_semantic_segmentation - ## DepthProImageProcessor [[autodoc]] DepthProImageProcessor - preprocess - - post_process_semantic_segmentation + - post_process_depth_estimation + +## DepthProImageProcessorFast + +[[autodoc]] DepthProImageProcessorFast + - preprocess + - post_process_depth_estimation ## DepthProModel @@ -112,8 +112,3 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DepthProForDepthEstimation - forward - -## DepthProForSemanticSegmentation - -[[autodoc]] DepthProForSemanticSegmentation - - forward diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index beb3215d8ddf8d..46220a0731e6f7 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -72,12 +72,6 @@ class DepthProConfig(PretrainedConfig): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): Whether to use the SwiGLU feedforward neural network. - apply_layernorm (`bool`, *optional*, defaults to `True`): - Whether to apply layer normalization to the feature maps in case the model is used as backbone. - reshape_hidden_states (`bool`, *optional*, defaults to `True`): - Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in - case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, - seq_len, hidden_size)`. intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`): Indices of the intermediate hidden states from the patch encoder to use for fusion. 
intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`): @@ -134,8 +128,6 @@ def __init__( layerscale_value=1.0, drop_path_rate=0.0, use_swiglu_ffn=False, - apply_layernorm=True, - reshape_hidden_states=True, intermediate_hook_ids=[11, 5], intermediate_feature_dims=[256, 256], scaled_images_ratios=[0.25, 0.5, 1], @@ -167,8 +159,6 @@ def __init__( self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate self.use_swiglu_ffn = use_swiglu_ffn - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual self.use_bias_in_fusion_residual = use_bias_in_fusion_residual self.use_fov_model = use_fov_model diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 2e074588d4e301..27754c5dbafcbf 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -22,16 +22,16 @@ from torch import nn from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( + ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, torch_int, - ModelOutput, ) from .configuration_depth_pro import DepthProConfig From 2408ec54e4f27d2abbecdb8374e58f34d91d8e96 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 12:18:09 +0500 Subject: [PATCH 44/72] use output states from image_encoder instead --- .../models/depth_pro/modeling_depth_pro.py | 49 ++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 27754c5dbafcbf..00241bb8646582 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -103,7 +103,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = self.position_embeddings.shape[1] - 1 + num_positions = embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +117,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - sqrt_num_positions = torch_int(num_positions**0.5) - patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) + patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype @@ -734,6 +734,7 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.fusion_hidden_size = config.fusion_hidden_size + self.patch_size = config.patch_size 
self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -867,7 +868,7 @@ def forward( patch_encodings = self.patch_encoder( patches, head_mask=head_mask, - output_attentions=output_attentions, + output_attentions=False, output_hidden_states=True, # required for intermediate features return_dict=True, ) @@ -875,11 +876,18 @@ def forward( patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first + # scale the image to patch size for image_encoder + scaled_image_to_patch_size = nn.functional.interpolate( + pixel_values, + size=(self.patch_size, self.patch_size), + mode="bilinear", + align_corners=False, + ) image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=scaled_image_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, return_dict=True, ) @@ -946,19 +954,15 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + ) # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (B, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = merge( - image_features, - batch_size=B, - merge_out_size=self.out_size * 2 ** (0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + # no merge required for image_features as they are already in batches instead of patches # d. upsample image_features = self.upsample_image( @@ -980,20 +984,9 @@ def forward( # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = patch_encodings.last_hidden_state - hidden_states = patch_encodings.hidden_states if output_hidden_states else None - attentions = patch_encodings.attentions if output_attentions else None - - num_patches = sum(scaled_images_num_patches) - # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 - indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T - indexes = indexes.to(last_hidden_state.device) - - last_hidden_state = last_hidden_state[indexes].mean(1) - if hidden_states is not None: - hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) - if attentions is not None: - attentions = tuple([state[indexes].mean(1) for state in attentions]) + last_hidden_state = image_encodings.last_hidden_state + hidden_states = image_encodings.hidden_states if output_hidden_states else None + attentions = image_encodings.attentions if output_attentions else None if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) From be0c2a37478589c31d5b3864f16b955f952b43cd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 09:13:25 +0500 Subject: [PATCH 45/72] Revert "use output states from image_encoder instead" This reverts commit 2408ec54e4f27d2abbecdb8374e58f34d91d8e96. 
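The revert above restores the patch-to-batch averaging added in patch 42, which relies on the `torch.arange(...).reshape(...).T` index pattern. The small sketch below only demonstrates that pattern for the toy values quoted in the in-code comment (num_patches=9, batch_size=3); nothing here is model-specific.

```python
import torch

num_patches, batch_size = 9, 3
# patches are stored interleaved by batch element: patch index = patch_idx * batch_size + batch_idx
indexes = torch.arange(num_patches).reshape(num_patches // batch_size, -1).T
print(indexes.tolist())  # [[0, 3, 6], [1, 4, 7], [2, 5, 8]]
```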
--- .../models/depth_pro/modeling_depth_pro.py | 49 +++++++++++-------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 00241bb8646582..27754c5dbafcbf 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -103,7 +103,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +117,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) - patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype @@ -734,7 +734,6 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.fusion_hidden_size = config.fusion_hidden_size - self.patch_size = config.patch_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -868,7 +867,7 @@ def forward( patch_encodings = self.patch_encoder( patches, head_mask=head_mask, - output_attentions=False, + output_attentions=output_attentions, output_hidden_states=True, # required for intermediate features return_dict=True, ) @@ -876,18 +875,11 @@ def forward( patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first - # scale the image to patch size for image_encoder - scaled_image_to_patch_size = nn.functional.interpolate( - pixel_values, - size=(self.patch_size, self.patch_size), - mode="bilinear", - align_corners=False, - ) image_encodings = self.image_encoder( - pixel_values=scaled_image_to_patch_size, + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, return_dict=True, ) @@ -954,15 +946,19 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (B, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together - # no merge required for image_features as they are already in batches instead of patches + image_features = merge( + image_features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample image_features = self.upsample_image( @@ -984,9 +980,20 @@ def forward( # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = image_encodings.last_hidden_state - hidden_states = image_encodings.hidden_states if output_hidden_states else None - attentions = image_encodings.attentions if output_attentions else None + last_hidden_state = patch_encodings.last_hidden_state + hidden_states = patch_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions if output_attentions else None + + num_patches = sum(scaled_images_num_patches) + # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 + indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T + indexes = indexes.to(last_hidden_state.device) + + last_hidden_state = last_hidden_state[indexes].mean(1) + if hidden_states is not None: + hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) + if attentions is not None: + attentions = tuple([state[indexes].mean(1) for state in attentions]) if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) From efed39f86e629a56df892f45dcbb5d4dc05222a4 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 09:18:16 +0500 Subject: [PATCH 46/72] make embeddings dynamic --- src/transformers/models/depth_pro/modeling_depth_pro.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 27754c5dbafcbf..4f97f37230cbbb 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -80,6 +80,7 @@ class DepthProViTEmbeddings(nn.Module): """ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Embeddings except antialias=True in interpolation and removal of mask_token + and enabling dynamic embeddings. 
""" def __init__(self, config: DepthProConfig) -> None: @@ -103,7 +104,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = self.position_embeddings.shape[1] - 1 + num_positions = embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +118,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - sqrt_num_positions = torch_int(num_positions**0.5) - patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) + patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype From c3b14fbcc54a1877bf6ebb7b7b61d9d67f1753ce Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 10:58:45 +0500 Subject: [PATCH 47/72] reshape output hidden states and attentions as part of computation graph --- .../models/depth_pro/modeling_depth_pro.py | 114 +++++++++++++----- .../depth_pro/test_modeling_depth_pro.py | 3 +- 2 files changed, 88 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 4f97f37230cbbb..6f20838375cf84 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -42,6 +42,25 @@ _CONFIG_FOR_DOC = "DepthProConfig" +def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor: + """ + converts tensor from shape: + (num_patches, seq_len, hidden_size) -> (batch_size, num_patches_per_batch, seq_len, hidden_size) + """ + data = data.reshape(-1, batch_size, *data.shape[1:]) + data = data.transpose(0, 1) + return data + +def batch_to_patch(data: torch.Tensor) -> torch.Tensor: + """ + converts tensor from shape: + (batch_size, num_patches_per_batch, seq_len, hidden_size) -> (num_patches, seq_len, hidden_size) + """ + data = data.transpose(0, 1) + data = data.reshape(-1, *data.shape[2:]) + return data + + class DepthProViTPatchEmbeddings(nn.Module): """ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings @@ -135,13 +154,17 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: return torch.cat((class_pos_embed, patch_pos_embed), dim=1) - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - batch_size, _, height, width = pixel_values.shape + def forward( + self, + pixel_values: torch.Tensor, + batch_size: Optional[int] = None, + ) -> torch.Tensor: + n, _, height, width = pixel_values.shape target_dtype = self.patch_embeddings.projection.weight.dtype embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) # add the [CLS] token to the embedded patch tokens - cls_tokens = self.cls_token.expand(batch_size, -1, -1) + cls_tokens = self.cls_token.expand(n, -1, -1) embeddings = torch.cat((cls_tokens, embeddings), dim=1) # add positional encoding to each token @@ -149,11 +172,14 @@ def forward(self, pixel_values: torch.Tensor) -> 
torch.Tensor: embeddings = self.dropout(embeddings) + if batch_size is not None: + embeddings = patch_to_batch(embeddings, batch_size) + return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthPro class DepthProViTSelfAttention(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention.__init__ with ViT->DepthPro def __init__(self, config: DepthProConfig) -> None: super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -172,13 +198,20 @@ def __init__(self, config: DepthProConfig) -> None: self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention.transpose_for_scores with ViT->DepthPro def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(new_x_shape) return x.permute(0, 2, 1, 3) + # Taken from transformers.models.vit.modeling_vit.ViTSelfAttention.forward with ViT->DepthPro + # with the addition of `batch_size` def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: mixed_query_layer = self.query(hidden_states) @@ -202,25 +235,37 @@ def forward( if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = torch.matmul(attention_probs, value_layer) + if batch_size is not None: + attention_probs_batched = patch_to_batch(attention_probs, batch_size) + attention_probs_patched = batch_to_patch(attention_probs_batched) + else: + attention_probs_patched = attention_probs_batched = attention_probs + + context_layer = torch.matmul(attention_probs_patched, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(new_context_layer_shape) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + outputs = (context_layer, attention_probs_batched) if output_attentions else (context_layer,) return outputs -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention.__init__ with Dinov2Config->DepthProConfig, Dinov2->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__(config) self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + # Taken from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention.forward with Dinov2Config->DepthProConfig, Dinov2->DepthProViT + # with the addition of `batch_size` def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
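A quick illustration of the patch-to-batch bookkeeping this commit threads through the ViT embeddings and attention classes above (and the layer and encoder classes later in this diff): the new `batch_size` argument lets intermediate hidden states be regrouped from the per-patch layout produced by the patch encoder into a per-batch layout, and back. The sketch below restates the two helpers added at the top of `modeling_depth_pro.py` and checks their round trip on dummy tensors; the tensor sizes are arbitrary example values, not taken from the model config, and the snippet is not part of the patch itself.

```python
# Minimal sketch of the patch <-> batch reshaping helpers introduced in this commit.
# Sizes below are arbitrary example values chosen only for illustration.
import torch


def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor:
    # (num_patches, seq_len, hidden_size) -> (batch_size, num_patches_per_batch, seq_len, hidden_size)
    data = data.reshape(-1, batch_size, *data.shape[1:])
    return data.transpose(0, 1)


def batch_to_patch(data: torch.Tensor) -> torch.Tensor:
    # (batch_size, num_patches_per_batch, seq_len, hidden_size) -> (num_patches, seq_len, hidden_size)
    data = data.transpose(0, 1)
    return data.reshape(-1, *data.shape[2:])


num_patches, seq_len, hidden_size, batch_size = 9, 577, 64, 3
patches = torch.randn(num_patches, seq_len, hidden_size)

batched = patch_to_batch(patches, batch_size)
print(batched.shape)  # torch.Size([3, 3, 577, 64])
# sample 0 collects patches [0, 3, 6], sample 1 collects [1, 4, 7], sample 2 collects [2, 5, 8]

restored = batch_to_patch(batched)
print(restored.shape)  # torch.Size([9, 577, 64])
assert torch.equal(patches, restored)  # the two helpers are exact inverses
```

This reproduces, as a pure reshape, the `[0, 3, 6], [1, 4, 7], [2, 5, 8]` grouping that the removed `torch.arange`-based indexing elsewhere in this same diff used to compute after the fact.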
@@ -229,7 +274,7 @@ def forward( 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' ) return super().forward( - hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions, batch_size=batch_size, ) mixed_query_layer = self.query(hidden_states) @@ -274,14 +319,15 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTAttention(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTAttention.__init__ with ViTConfig->DepthProConfig, ViT->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__() self.attention = DepthProViTSelfAttention(config) self.output = DepthProViTSelfOutput(config) self.pruned_heads = set() + # Copied from transformers.models.vit.modeling_vit.ViTAttention.prune_heads def prune_heads(self, heads: Set[int]) -> None: if len(heads) == 0: return @@ -300,13 +346,16 @@ def prune_heads(self, heads: Set[int]) -> None: self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) + # Taken from transformers.models.vit.modeling_vit.ViTAttention.prune_heads + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - self_outputs = self.attention(hidden_states, head_mask, output_attentions) + self_outputs = self.attention(hidden_states, head_mask, output_attentions, batch_size) attention_output = self.output(self_outputs[0], hidden_states) @@ -411,10 +460,10 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: } -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing class DepthProViTLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer.__init__ with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -431,16 +480,23 @@ def __init__(self, config: DepthProConfig) -> None: self.mlp = DepthProViTMLP(config) self.layer_scale2 = DepthProViTLayerScale(config) + # Taken from transformers.models.dinov2.modeling_dinov2.Dinov2Layer.forward + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if batch_size is not None: + hidden_states = batch_to_patch(hidden_states) + self_attention_outputs = self.attention( self.norm1(hidden_states), # in DepthProViT, layernorm is applied before self-attention head_mask, output_attentions=output_attentions, + batch_size=batch_size, ) attention_output = self_attention_outputs[0] @@ -458,19 +514,24 @@ def forward( # second residual connection layer_output = self.drop_path(layer_output) + hidden_states + 
if batch_size is not None: + layer_output = patch_to_batch(layer_output, batch_size) + outputs = (layer_output,) + outputs return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTEncoder(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTEncoder.__init__ with ViTConfig->DepthProConfig, ViT->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.layer = nn.ModuleList([DepthProViTLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + # Taken from transformers.models.vit.modeling_vit.ViTEncoder.__init__ + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, @@ -478,6 +539,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, + batch_size: Optional[int] = None, ) -> Union[tuple, BaseModelOutput]: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -494,9 +556,10 @@ def forward( hidden_states, layer_head_mask, output_attentions, + batch_size, ) else: - layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, batch_size) hidden_states = layer_outputs[0] @@ -532,6 +595,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + batch_size: Optional[int] = None, ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -542,7 +606,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - embedding_output = self.embeddings(pixel_values) + embedding_output = self.embeddings(pixel_values, batch_size=batch_size) encoder_outputs = self.encoder( embedding_output, @@ -550,6 +614,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + batch_size=batch_size, ) sequence_output = encoder_outputs[0] sequence_output = self.layernorm(sequence_output) @@ -871,9 +936,12 @@ def forward( output_attentions=output_attentions, output_hidden_states=True, # required for intermediate features return_dict=True, + batch_size=B, ) + last_hidden_state = patch_encodings.last_hidden_state + last_hidden_state = batch_to_patch(last_hidden_state) scaled_images_last_hidden_state = torch.split_with_sizes( - patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] + last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( @@ -917,6 +985,7 @@ def forward( self.intermediate_hook_ids[i] + 1 ) # +1 to correct index position as hidden_states contain embedding output as well hidden_state = patch_encodings.hidden_states[layer_id] + hidden_state = batch_to_patch(hidden_state) hidden_state = hidden_state[ : scaled_images_num_patches[-1] ] # num_patches to be of same length as highest resolution @@ -985,17 +1054,6 @@ def forward( hidden_states = patch_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions if output_attentions else None - num_patches = sum(scaled_images_num_patches) - # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 - 
indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T - indexes = indexes.to(last_hidden_state.device) - - last_hidden_state = last_hidden_state[indexes].mean(1) - if hidden_states is not None: - hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) - if attentions is not None: - attentions = tuple([state[indexes].mean(1) for state in attentions]) - if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 3d37965dcd1bd0..9e881cf273b7b9 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -131,7 +131,8 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size)) def create_and_check_for_depth_estimation(self, config, pixel_values, labels): config.num_labels = self.num_labels From 7cf2485adef235b906b469a38002a8dacc3d0537 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 11:14:21 +0500 Subject: [PATCH 48/72] fix ruff formating --- .../depth_pro/image_processing_depth_pro.py | 18 ++++++---- .../models/depth_pro/modeling_depth_pro.py | 36 +++++++++++-------- .../depth_pro/test_modeling_depth_pro.py | 8 +++-- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 65a29900c63744..164c7e28c6e237 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -157,13 +157,17 @@ def resize( raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - return torch.nn.functional.interpolate( - # input should be (B, C, H, W) - input=torch.from_numpy(image).unsqueeze(0), - size=output_size, - mode=pil_torch_interpolation_mapping[resample].value, - antialias=antialias, - ).squeeze(0).numpy() + return ( + torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=torch.from_numpy(image).unsqueeze(0), + size=output_size, + mode=pil_torch_interpolation_mapping[resample].value, + antialias=antialias, + ) + .squeeze(0) + .numpy() + ) def _validate_input_arguments( self, diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 6f20838375cf84..8fa286c70919f3 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -51,6 +51,7 @@ def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor: data = data.transpose(0, 1) return data + def batch_to_patch(data: torch.Tensor) -> torch.Tensor: """ converts tensor from shape: @@ -155,10 +156,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward( - self, - pixel_values: torch.Tensor, - batch_size: Optional[int] = None, - ) -> torch.Tensor: + self, + pixel_values: torch.Tensor, + batch_size: Optional[int] = None, + ) -> torch.Tensor: n, _, height, width = pixel_values.shape target_dtype = self.patch_embeddings.projection.weight.dtype embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) @@ -274,7 +275,10 @@ def forward( 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( - hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions, batch_size=batch_size, + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + batch_size=batch_size, ) mixed_query_layer = self.query(hidden_states) @@ -940,9 +944,9 @@ def forward( ) last_hidden_state = patch_encodings.last_hidden_state last_hidden_state = batch_to_patch(last_hidden_state) - scaled_images_last_hidden_state = torch.split_with_sizes( - last_hidden_state, scaled_images_num_patches[::-1] - )[::-1] # -1 as patch encoder expects high res patches first + scaled_images_last_hidden_state = torch.split_with_sizes(last_hidden_state, scaled_images_num_patches[::-1]) + scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] + # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( pixel_values=scaled_images[0], # provide least resolution image @@ -1610,12 +1614,16 @@ def forward( fused_features = self.fusion_stage(features) predicted_depth = self.head(fused_features) - fov = self.fov_model( - pixel_values=pixel_values, - # use lowest scaled image features for fov model - global_features=features[0].detach(), - head_mask=head_mask, - ) if self.use_fov_model else None + fov = ( + self.fov_model( + pixel_values=pixel_values, + # use lowest scaled image features for fov model + global_features=features[0].detach(), + head_mask=head_mask, + ) + if self.use_fov_model + else None + ) if not return_dict: outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 9e881cf273b7b9..e350b067a118c8 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -91,7 +91,7 @@ def __init__( self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 - self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -131,8 +131,10 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size)) + num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size) + ) def create_and_check_for_depth_estimation(self, config, pixel_values, labels): config.num_labels = self.num_labels From 0aa451df3e6862291d2097d5a1e6aa5e9aa91f23 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 22:41:11 +0500 Subject: [PATCH 49/72] fix docstring failure --- .../models/depth_pro/modeling_depth_pro.py | 16 +++++++++++++++- utils/check_docstrings.py | 1 - utils/check_repo.py | 1 - 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 
8fa286c70919f3..1498ce4003d39b 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1496,11 +1496,25 @@ class DepthProDepthEstimatorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None +DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + use_fov_model (`bool`, *optional*, defaults to `True`): + Whether to use `DepthProFOVModel` to generate the field of view. +""" + + @add_start_docstrings( """ DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). """, - DEPTH_PRO_START_DOCSTRING, + DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING, ) class DepthProForDepthEstimation(DepthProPreTrainedModel): def __init__(self, config, use_fov_model=None): diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 34deed0df47e01..0be960f4a33e6d 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -140,7 +140,6 @@ "DPRReaderTokenizer", "DPRReaderTokenizerFast", "DPTModel", - "DepthProModel", "Data2VecAudioConfig", "Data2VecTextConfig", "Data2VecTextModel", diff --git a/utils/check_repo.py b/utils/check_repo.py index 2e131e8791530e..10be5cdcd26230 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -213,7 +213,6 @@ "JukeboxPrior", "SamModel", "DPTForDepthEstimation", - "DepthProForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", "ViltForImagesAndTextClassification", From 160afbf57789906a134000a5b6ee99982cf4ae6f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 23:32:39 +0500 Subject: [PATCH 50/72] use num_fov_head_layers in tests --- tests/models/depth_pro/test_modeling_depth_pro.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index e350b067a118c8..03f69e8ad1fee5 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -64,6 +64,7 @@ def __init__( attention_probs_dropout_prob=0.1, initializer_range=0.02, use_fov_model=True, + num_fov_head_layers=0, num_labels=3, ): self.parent = parent @@ -88,6 +89,7 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 @@ -124,6 +126,7 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, initializer_range=self.initializer_range, use_fov_model=self.use_fov_model, + num_fov_head_layers=self.num_fov_head_layers, ) def create_and_check_model(self, config, pixel_values, labels): From 9d2be2603d9a75346526b2a37711c6edc40125c8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:30:08 +0500 Subject: [PATCH 51/72] update doc --- 
docs/source/en/model_doc/depth_pro.md | 37 +++++++++++++++++----------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md
index 7e4ac13f1d648f..041c4d49dffc93 100644
--- a/docs/source/en/model_doc/depth_pro.md
+++ b/docs/source/en/model_doc/depth_pro.md
@@ -26,7 +26,7 @@ The abstract from the paper is the following:
 *We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.*
-drawing DepthPro architecture. Taken from the original paper.
@@ -38,16 +38,26 @@ This model was contributed by [geetu040](https://github.com/geetu040). The origi
 ## Usage tips

 ```python
-from transformers import Dinov2Config, DepthProConfig, DepthProForDepthEstimation
+from transformers import DepthProConfig, DepthProForDepthEstimation

-# initialize with a Transformer-based backbone such as DINOv2
-# in that case, we also specify `reshape_hidden_states=False` to get feature maps of shape (batch_size, num_channels, height, width)
-backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False)
+config = DepthProConfig()
+model = DepthProForDepthEstimation(config=config)
+```
+
+- By default, the model takes an input image of size `1536`. This can be changed via the config, and the model is also compatible with images of other widths and heights.
+- The input image is scaled with the ratios specified in `scaled_images_ratios`, then each scaled image is split into patches of size `patch_size` with the overlap ratios given in `scaled_images_overlap_ratios`.
+- These patches go through `DINOv2 (ViT)`-based encoders and are reassembled via a `DPT`-based decoder.
+- `DepthProForDepthEstimation` can also predict the `FOV (Field of View)` if `use_fov_model` is set to `True` in the config.
+- `DepthProImageProcessor` can be used for preprocessing the inputs and postprocessing the outputs. `DepthProImageProcessor.post_process_depth_estimation` interpolates the `predicted_depth` back to match the input image size.
+- To generate a `predicted_depth` of the same size as the input image, make sure the config is created such that
+```
+image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size

-config = DepthProConfig(backbone_config=backbone_config)
-model = DepthProForDepthEstimation(config=config)
+
+where
+n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios)
 ```
+
 ### Using Scaled Dot Product Attention (SDPA)

PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`.
This function @@ -59,9 +69,9 @@ page for more information. SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` -from transformers import ViTForImageClassification -model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +```py +from transformers import DepthProForDepthEstimation +model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", attn_implementation="sdpa", torch_dtype=torch.float16) ... ``` @@ -78,12 +88,11 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` ## Resources -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro. +- Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) -- Demo notebooks for [`DepthProForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DepthPro). +- Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) -- [Semantic segmentation task guide](../tasks/semantic_segmentation) -- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) + If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. From e208459cebe6b8f821aa14e0d9e7735466751daf Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:38:32 +0500 Subject: [PATCH 52/72] check consistency with config --- .../models/depth_pro/modeling_depth_pro.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 1498ce4003d39b..605ea38ea736e7 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -838,6 +838,23 @@ def __init__(self, config: DepthProConfig) -> None: f"by patch_embeddings_size={config.patch_embeddings_size}." ) + # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims are consistent + if not (len(config.scaled_images_ratios) == len(config.scaled_images_overlap_ratios) == len(config.scaled_images_feature_dims)): + raise ValueError( + f"len(scaled_images_ratios)={len(config.scaled_images_ratios)} and " + f"len(scaled_images_overlap_ratios)={len(config.scaled_images_overlap_ratios)} and " + f"len(scaled_images_feature_dims)={len(config.scaled_images_feature_dims)}, " + f"should match in config." + ) + + # intermediate_hook_ids, intermediate_feature_dims are consistent + if not (len(config.intermediate_hook_ids) == len(config.intermediate_feature_dims)): + raise ValueError( + f"len(intermediate_hook_ids)={len(config.intermediate_hook_ids)} and " + f"len(intermediate_feature_dims)={len(config.intermediate_feature_dims)}, " + f"should match in config." 
+ ) + # patch encoder self.patch_encoder = DepthProViT(config) From 0415722bd6dd44f4b7d56d0cacf8cdd3f958cb41 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:42:31 +0500 Subject: [PATCH 53/72] ruff formatting --- src/transformers/models/depth_pro/modeling_depth_pro.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 605ea38ea736e7..040b9eb07962e6 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -839,7 +839,11 @@ def __init__(self, config: DepthProConfig) -> None: ) # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims are consistent - if not (len(config.scaled_images_ratios) == len(config.scaled_images_overlap_ratios) == len(config.scaled_images_feature_dims)): + if not ( + len(config.scaled_images_ratios) + == len(config.scaled_images_overlap_ratios) + == len(config.scaled_images_feature_dims) + ): raise ValueError( f"len(scaled_images_ratios)={len(config.scaled_images_ratios)} and " f"len(scaled_images_overlap_ratios)={len(config.scaled_images_overlap_ratios)} and " From f4e7404191244a86a91d5e93c3be82ffa7d6b970 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 10:57:52 +0500 Subject: [PATCH 54/72] update test case --- tests/models/depth_pro/test_modeling_depth_pro.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 03f69e8ad1fee5..54c5e870a258f3 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -52,12 +52,12 @@ def __init__( use_labels=True, hidden_size=32, fusion_hidden_size=16, - intermediate_hook_ids=[1, 0], - intermediate_feature_dims=[8, 8], + intermediate_hook_ids=[0], + intermediate_feature_dims=[8], scaled_images_ratios=[0.5, 1.0], scaled_images_overlap_ratios=[0.0, 0.2], scaled_images_feature_dims=[12, 12], - num_hidden_layers=2, + num_hidden_layers=1, num_attention_heads=4, hidden_act="gelu", hidden_dropout_prob=0.1, @@ -95,6 +95,9 @@ def __init__( self.num_patches = (patch_size // patch_embeddings_size) ** 2 self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) + self.expected_depth_size = 2**(n_fusion_blocks+1) * patch_size / patch_embeddings_size + def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -145,7 +148,7 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 2c1cc10ee8ddefce3649dac81144e5095ee00ba8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 14:55:06 +0500 Subject: [PATCH 55/72] fix ruff formatting --- tests/models/depth_pro/test_modeling_depth_pro.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 54c5e870a258f3..215756d45e99b9 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -96,7 +96,7 @@ def __init__( self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - self.expected_depth_size = 2**(n_fusion_blocks+1) * patch_size / patch_embeddings_size + self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size / patch_embeddings_size def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -148,7 +148,9 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size)) + self.parent.assertEqual( + result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size) + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 871b80db318a8e8b2b70533acd62cbcec678cc74 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 10:42:02 +0500 Subject: [PATCH 56/72] add tests for fov --- .../depth_pro/test_modeling_depth_pro.py | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 215756d45e99b9..48983c9aca3a36 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -63,8 +63,7 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, initializer_range=0.02, - use_fov_model=True, - num_fov_head_layers=0, + use_fov_model=False, num_labels=3, ): self.parent = parent @@ -89,7 +88,6 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.use_fov_model = use_fov_model - self.num_fov_head_layers = num_fov_head_layers self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 @@ -129,7 +127,6 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, initializer_range=self.initializer_range, use_fov_model=self.use_fov_model, - num_fov_head_layers=self.num_fov_head_layers, ) def create_and_check_model(self, config, pixel_values, labels): @@ -152,6 +149,36 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size) ) + def create_and_check_for_fov(self, config, pixel_values, labels): + model = DepthProForDepthEstimation(config, use_fov_model=True) + model.to(torch_device) + model.eval() + + # check if the fov_model (DinoV2-based encoder) is created + self.parent.assertIsNotNone(model.fov_model) + + batched_pixel_values = pixel_values + row_pixel_values = pixel_values[:1] + + with torch.no_grad(): + model_batched_output_fov = model(batched_pixel_values).fov + model_row_output_fov = model(row_pixel_values).fov + + # check if fov is returned + self.parent.assertIsNotNone(model_batched_output_fov) + self.parent.assertIsNotNone(model_row_output_fov) + + # check output 
shape consistency for fov + self.parent.assertEqual(model_batched_output_fov.shape, (self.batch_size,)) + + # check equivalence between batched and single row outputs for fov + diff = torch.max(torch.abs(model_row_output_fov - model_batched_output_fov[:1])) + model_name = model.__class__.__name__ + self.parent.assertTrue( + diff <= 1e-03, + msg=(f"Batched and Single row outputs are not equal in {model_name} for fov. " f"Difference={diff}."), + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values, labels = config_and_inputs @@ -208,6 +235,10 @@ def test_for_depth_estimation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + def test_for_fov(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_fov(*config_and_inputs) + def test_training(self): for model_class in self.all_model_classes: if model_class.__name__ == "DepthProForDepthEstimation": From 0ff06556163a39f90eede4d5e889554e46b9de46 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 15:11:06 +0500 Subject: [PATCH 57/72] use interpolation in postprocess --- .../models/depth_pro/image_processing_depth_pro.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 164c7e28c6e237..228c3d992457e4 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -393,10 +393,11 @@ def post_process_depth_estimation( outputs["fov"].append(fov) # interpolate - predicted_depth = self.resize( - predicted_depth.unsqueeze(0).unsqueeze(1), + predicted_depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=predicted_depth.unsqueeze(0).unsqueeze(1), size=target_size, - resample=self.resample, + mode=pil_torch_interpolation_mapping[self.resample].value, antialias=self.antialias, ).squeeze() From befa6cdbca6194a4fab82c9865bfb9deeebe54c7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 15:26:50 +0500 Subject: [PATCH 58/72] run and fix slow tests locally --- .../depth_pro/test_modeling_depth_pro.py | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 48983c9aca3a36..a3026801d59379 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -94,7 +94,7 @@ def __init__( self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size / patch_embeddings_size + self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size // patch_embeddings_size def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -313,8 +313,8 @@ def test_initialization(self): @slow def test_model_from_pretrained(self): - model_name = "Intel/depth_pro-large" - model = DepthProModel.from_pretrained(model_name) + model_path = "geetu040/DepthPro" + model = DepthProModel.from_pretrained(model_path) 
self.assertIsNotNone(model) @@ -329,8 +329,10 @@ def prepare_img(): @slow class DepthProModelIntegrationTest(unittest.TestCase): def test_inference_depth_estimation(self): - image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") - model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large").to(torch_device) + model_path = "geetu040/DepthPro" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device) + config = model.config image = prepare_img() inputs = image_processor(images=image, return_tensors="pt").to(torch_device) @@ -341,18 +343,21 @@ def test_inference_depth_estimation(self): predicted_depth = outputs.predicted_depth # verify the predicted depth - expected_shape = torch.Size((1, 384, 384)) + n_fusion_blocks = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + expected_depth_size = 2 ** (n_fusion_blocks + 1) * config.patch_size // config.patch_embeddings_size + expected_shape = torch.Size((1, expected_depth_size, expected_depth_size)) self.assertEqual(predicted_depth.shape, expected_shape) expected_slice = torch.tensor( - [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + [[1.0582, 1.1225, 1.1335], [1.1154, 1.1398, 1.1486], [1.1434, 1.1500, 1.1643]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) def test_post_processing_depth_estimation(self): - image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") - model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large") + model_path = "geetu040/DepthPro" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path) image = prepare_img() inputs = image_processor(images=image, return_tensors="pt") @@ -361,17 +366,15 @@ def test_post_processing_depth_estimation(self): with torch.no_grad(): outputs = model(**inputs) - predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] - expected_shape = torch.Size((384, 384)) - self.assertTrue(predicted_depth.shape == expected_shape) - - predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) - predicted_depth_l = predicted_depth_l[0]["predicted_depth"] - expected_shape = torch.Size((500, 500)) - self.assertTrue(predicted_depth_l.shape == expected_shape) + predicted_depth = outputs.predicted_depth + fov = outputs.fov + target_size = [[image.height, image.width]] * len(predicted_depth) - output_enlarged = torch.nn.functional.interpolate( - predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False - ).squeeze() - self.assertTrue(output_enlarged.shape == expected_shape) - self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3)) + outputs = image_processor.post_process_depth_estimation( + predicted_depths=predicted_depth, + fovs=fov, + target_sizes=target_size, + ) + predicted_depth = outputs["predicted_depth"][0] + expected_shape = torch.Size((image.height, image.width)) + self.assertTrue(predicted_depth.shape == expected_shape) From 99ac5e81cc98b9297a81af784bf227179f1609e3 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 19:53:22 +0500 Subject: [PATCH 59/72] use scaled_images_features for image and fov encoder --- .../models/depth_pro/modeling_depth_pro.py | 80 
++++++++++--------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 040b9eb07962e6..f77e24925c88b1 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -959,7 +959,8 @@ def forward( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=True, # required for intermediate features + # required for intermediate features + output_hidden_states=self.n_intermediate_hooks or output_hidden_states, return_dict=True, batch_size=B, ) @@ -969,12 +970,16 @@ def forward( scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] # -1 as patch encoder expects high res patches first + # scale the image to patch size for image_encoder + image_scaled_to_patch_size = nn.functional.interpolate( + pixel_values, + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, + ) image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=image_scaled_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, - return_dict=True, ) # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram @@ -1041,19 +1046,15 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + ) # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (B, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = merge( - image_features, - batch_size=B, - merge_out_size=self.out_size * 2 ** (0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + # no merge required for image_features as they are already in batches instead of patches # d. 
upsample image_features = self.upsample_image( @@ -1073,8 +1074,6 @@ def forward( *intermediate_features, ] - # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = patch_encodings.last_hidden_state hidden_states = patch_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions if output_attentions else None @@ -1420,35 +1419,42 @@ def forward( B, C, W, H = pixel_values.shape # follow the steps same as with image features in DepthProEncoder - pixel_values = interpolate( - pixel_values, - scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image - ) - patches = patch( + # except for the extra encoder_neck layer applied + + image_scaled_to_patch_size = nn.functional.interpolate( pixel_values, - patch_size=self.config.patch_size, - overlap_ratio=self.config.scaled_images_overlap_ratios[0], + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, ) - encoder_outputs = self.encoder( - patches, + encodings = self.encoder( + image_scaled_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ) - last_hidden_state = encoder_outputs.last_hidden_state - last_hidden_state = self.encoder_neck(last_hidden_state) - last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) - last_hidden_state = merge( - last_hidden_state, - batch_size=B, - merge_out_size=self.out_size, ) + # a. extract hidden_state + hidden_state = ( + encodings.last_hidden_state + ) # (B, self.seq_len+1, config.hidden_size) + # extra step + hidden_state = self.encoder_neck(hidden_state) + # (B, self.fusion_hidden_size//2, self.out_size, self.out_size) + + # b. reshape back to image like + fov_features = reshape_feature( + hidden_state, self.out_size, self.out_size + ) # (B, config.hidden_size, self.out_size, self.out_size) + + # c. merge patches back together + # no merge required for fov_features as they are already in batches instead of patches + + # d. 
upsample + # no upsampling required for fov_features, the head later downsamples to create scalars + global_features = self.global_neck(global_features) - last_hidden_state = last_hidden_state + global_features - fov_output = self.head(last_hidden_state) + fov_features = fov_features + global_features + fov_output = self.head(fov_features) fov_output = fov_output.reshape(B) return fov_output @@ -1652,7 +1658,7 @@ def forward( fov = ( self.fov_model( pixel_values=pixel_values, - # use lowest scaled image features for fov model + # frozon features from encoder are used global_features=features[0].detach(), head_mask=head_mask, ) From ebb62dd2190a164d8f4cfbb218cd7c2099515ae1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:28:32 +0500 Subject: [PATCH 60/72] return fused_hidden_states in fusion stage --- .../models/depth_pro/modeling_depth_pro.py | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f77e24925c88b1..91758a3db485fb 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -765,7 +765,6 @@ def merge(patches, batch_size, merge_out_size): boxes.append(boxes_in_row) boxes = torch.cat(boxes, dim=-2) - boxes = boxes[..., :merge_out_size, :merge_out_size] return boxes @@ -1303,7 +1302,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: # Taken from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer -# except it uses deconv, skip_add and avoids interpolation (it always receives consitent inputs) +# except it uses deconv annd skip_add class DepthProFeatureFusionLayer(nn.Module): def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: super().__init__() @@ -1328,6 +1327,10 @@ def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: def forward(self, hidden_state, residual=None): if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) hidden_state = self.residual_layer2(hidden_state) @@ -1357,13 +1360,17 @@ def forward(self, hidden_states): f"doesnot match len(hidden_states)={len(hidden_states)}" ) - # first layer only uses the last hidden_state - fused_hidden_state = self.layers[0](hidden_states[0]) - # looping from the second layer to last layer - for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): - fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states = [] + fused_hidden_state = None + for hidden_state, layer in zip(hidden_states, self.layers): + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) - return fused_hidden_state + return fused_hidden_states class DepthProFOVModel(nn.Module): @@ -1652,8 +1659,8 @@ def forward( ) features = depth_pro_outputs.features features = [proj(feature) for proj, feature in zip(self.projections, features)] - fused_features = self.fusion_stage(features) - predicted_depth = self.head(fused_features) + fused_hidden_states = self.fusion_stage(features) + predicted_depth = 
self.head(fused_hidden_states[-1]) fov = ( self.fov_model( From 46c88e8bd3ba4dc2331b81fad1a54a4b902445e7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:44:44 +0500 Subject: [PATCH 61/72] fix example --- .../models/depth_pro/modeling_depth_pro.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 91758a3db485fb..8f1609b6fb1514 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1206,14 +1206,8 @@ def forward( >>> with torch.no_grad(): ... output = model(**inputs) ... - >>> for state in output.last_hidden_state: - ... print(state.shape) - ... - torch.Size([1, 1024, 48, 48]) - torch.Size([1, 1024, 96, 96]) - torch.Size([1, 512, 192, 192]) - torch.Size([1, 256, 384, 384]) - torch.Size([1, 256, 768, 768]) + >>> output.last_hidden_state.shape + torch.Size([1, 35, 577, 1024]) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From 243135880028d09441fb41440f760a9a2c329a33 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:48:36 +0500 Subject: [PATCH 62/72] fix ruff --- src/transformers/models/depth_pro/modeling_depth_pro.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 8f1609b6fb1514..bd6c811a1163b0 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1043,9 +1043,7 @@ def forward( # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = ( - image_encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + hidden_state = image_encodings.last_hidden_state # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( @@ -1434,9 +1432,7 @@ def forward( ) # a. extract hidden_state - hidden_state = ( - encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + hidden_state = encodings.last_hidden_state # (B, self.seq_len+1, config.hidden_size) # extra step hidden_state = self.encoder_neck(hidden_state) # (B, self.fusion_hidden_size//2, self.out_size, self.out_size) From d9d3a49906bab33156ab97f8ebb7b2bd87d45a49 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 21 Dec 2024 10:23:09 +0500 Subject: [PATCH 63/72] fix copyright license for all files --- docs/source/en/model_doc/depth_pro.md | 2 +- src/transformers/models/depth_pro/__init__.py | 2 +- src/transformers/models/depth_pro/configuration_depth_pro.py | 2 +- .../models/depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- src/transformers/models/depth_pro/image_processing_depth_pro.py | 2 +- .../models/depth_pro/image_processing_depth_pro_fast.py | 2 +- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- tests/models/depth_pro/test_image_processing_depth_pro.py | 2 +- tests/models/depth_pro/test_modeling_depth_pro.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 041c4d49dffc93..9019547434af84 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -1,4 +1,4 @@ -