From 5e8d40c401df2b06b60e2f3dec13373c54d21c9d Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 22 Dec 2023 20:21:21 +0000 Subject: [PATCH 01/32] Enable instantiating model with pretrained backbone weights --- src/transformers/models/vitmatte/configuration_vitmatte.py | 3 --- tests/utils/test_backbone_utils.py | 4 ---- 2 files changed, 7 deletions(-) diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 275640d1d079a1..9984d89e1d6e4b 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -99,10 +99,7 @@ def __init__( if use_pretrained_backbone: raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: - raise ValueError("You can't specify both `backbone` and `backbone_config`.") - if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `VitDet` backbone.") backbone_config = CONFIG_MAPPING["vitdet"](out_features=["stage4"]) diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index cd9a5a29a8c071..1b9f49b5cfb6e1 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -31,10 +31,6 @@ import torch from transformers import BertPreTrainedModel - - -class BackboneUtilsTester(unittest.TestCase): - def test_get_aligned_output_features_output_indices(self): stage_names = ["a", "b", "c"] # Defaults to last layer if both are None From 11ede698991169b4f27a7dfd294b97900bf11896 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 3 Jan 2024 20:40:24 +0000 Subject: [PATCH 02/32] Clarify pretrained import --- tests/utils/test_backbone_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index 1b9f49b5cfb6e1..cd9a5a29a8c071 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -31,6 +31,10 @@ import torch from transformers import BertPreTrainedModel + + +class BackboneUtilsTester(unittest.TestCase): + def test_get_aligned_output_features_output_indices(self): stage_names = ["a", "b", "c"] # Defaults to last layer if both are None From e1067a4f2d413cce0b0c79fa6a24e2601072fcd7 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 15:17:26 +0000 Subject: [PATCH 03/32] Use load_backbone instead --- src/transformers/models/dpt/modeling_dpt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index aad3330279f051..da9654c9d46178 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1079,6 +1079,8 @@ def __init__(self, config): self.backbone = load_backbone(config) else: self.dpt = DPTModel(config, add_pooling_layer=False) + else: + self.backbone = load_backbone(config) # Neck self.neck = DPTNeck(config) From d19fc39715483ef6768205fb50a93acd8d3f459e Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 18:38:58 +0000 Subject: [PATCH 04/32] Add backbone_kwargs to config --- src/transformers/utils/backbone_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index 14fcfe4a50a2d2..696edf6155c921 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -288,7 +288,7 @@ def to_dict(self): return output -def load_backbone(config): +def load_backbone(config, **kwargs): """ Loads the backbone model from a config object. From 59ba869751995e0653c2d887b398fecb5106dce3 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 31 Jan 2024 19:47:10 +0000 Subject: [PATCH 05/32] Fix up --- src/transformers/models/dpt/modeling_dpt.py | 2 -- src/transformers/models/vitmatte/configuration_vitmatte.py | 1 - 2 files changed, 3 deletions(-) diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index da9654c9d46178..aad3330279f051 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1079,8 +1079,6 @@ def __init__(self, config): self.backbone = load_backbone(config) else: self.dpt = DPTModel(config, add_pooling_layer=False) - else: - self.backbone = load_backbone(config) # Neck self.neck = DPTNeck(config) diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 9984d89e1d6e4b..840813b9c66e4e 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -99,7 +99,6 @@ def __init__( if use_pretrained_backbone: raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `VitDet` backbone.") backbone_config = CONFIG_MAPPING["vitdet"](out_features=["stage4"]) From 899c9bd3f8d8b04cffc04083e3d776757250a488 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 1 Feb 2024 11:19:14 +0000 Subject: [PATCH 06/32] Add tests --- tests/utils/test_backbone_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index cd9a5a29a8c071..244f62950f048f 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -167,7 +167,7 @@ def test_load_backbone_from_checkpoint(self): ) backbone = load_backbone(config) # We can't know ahead of time the exact output features and indices, or the layer names before - # creating the timm model, so it defaults to the last layer (-1,) and has a different layer name + # creating the timm model, so it defalts to the last layer (-1,) and has a different layer name self.assertEqual(backbone.out_indices, (-1,)) self.assertEqual(backbone.out_features, ["layer4"]) self.assertIsInstance(backbone, TimmBackbone) From 6fc904d5a03fe67a95ffa0f03c26f82f1396c6e2 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 1 Feb 2024 15:23:09 +0000 Subject: [PATCH 07/32] Tidy up --- src/transformers/models/vitmatte/configuration_vitmatte.py | 4 ++++ src/transformers/utils/backbone_utils.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 840813b9c66e4e..275640d1d079a1 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -99,6 +99,10 @@ def __init__( if use_pretrained_backbone: raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `VitDet` backbone.") backbone_config = CONFIG_MAPPING["vitdet"](out_features=["stage4"]) diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index 696edf6155c921..14fcfe4a50a2d2 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -288,7 +288,7 @@ def to_dict(self): return output -def load_backbone(config, **kwargs): +def load_backbone(config): """ Loads the backbone model from a config object. 
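Aside: the patches above and below converge on a single `load_backbone` helper that builds a backbone from the parent model's config. A minimal sketch of the intended call pattern, pieced together from the tests in this series, is shown here; the `microsoft/resnet-18` checkpoint and the printed attributes come from those tests and are illustrative assumptions about the in-progress API, not a guarantee of its final shape (running it also needs network access to the Hub).

    from transformers import MaskFormerConfig
    from transformers.utils.backbone_utils import load_backbone

    # Config of a model that wraps a backbone: only the checkpoint name is set,
    # so load_backbone resolves the backbone architecture from that checkpoint's config.
    config = MaskFormerConfig(backbone="microsoft/resnet-18", backbone_config=None)

    # Instantiates a ResNet backbone; weights stay randomly initialized unless
    # use_pretrained_backbone=True is also set on the config.
    backbone = load_backbone(config)
    print(type(backbone).__name__, backbone.out_features)
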
From ad201dd2edc960406b2acb9db0e8f2750400c7b5 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 22 Dec 2023 20:21:21 +0000 Subject: [PATCH 08/32] Enable instantiating model with pretrained backbone weights --- .../configuration_conditional_detr.py | 11 +-- .../configuration_deformable_detr.py | 11 +-- .../models/deta/configuration_deta.py | 16 +--- .../models/detr/configuration_detr.py | 13 +-- .../models/dpt/configuration_dpt.py | 23 +---- .../mask2former/configuration_mask2former.py | 20 +---- .../maskformer/configuration_maskformer.py | 19 +--- .../oneformer/configuration_oneformer.py | 19 +--- .../configuration_table_transformer.py | 13 +-- .../models/tvp/configuration_tvp.py | 16 +--- .../models/upernet/configuration_upernet.py | 16 +--- .../vit_hybrid/configuration_vit_hybrid.py | 16 +--- .../models/vitmatte/configuration_vitmatte.py | 16 +--- src/transformers/utils/backbone_utils.py | 53 ++++------- tests/utils/test_backbone_utils.py | 90 +------------------ utils/check_config_attributes.py | 5 -- 16 files changed, 53 insertions(+), 304 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 945e5edb32ad30..163865f7332343 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -94,10 +94,8 @@ class ConditionalDetrConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -181,11 +179,6 @@ def __init__( focal_alpha=0.25, **kwargs, ): - if not use_timm_backbone and use_pretrained_backbone: - raise ValueError( - "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" - ) - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 6d32f6220df586..6a9cb70b4221e7 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -87,10 +87,8 @@ class DeformableDetrConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. 
use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -198,11 +196,6 @@ def __init__( disable_custom_kernels=False, **kwargs, ): - if not use_timm_backbone and use_pretrained_backbone: - raise ValueError( - "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" - ) - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index 1604bc56e6396d..124fe9d94d2b59 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -44,13 +44,8 @@ class DetaConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): - Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers - library. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead. 
@@ -154,8 +149,6 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, - use_timm_backbone=False, - backbone_kwargs=None, num_queries=900, max_position_embeddings=2048, encoder_layers=6, @@ -196,9 +189,6 @@ def __init__( disable_custom_kernels=True, **kwargs, ): - if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") @@ -217,8 +207,6 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 9b9b5afacd0b7f..56fb76a27728b7 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -94,11 +94,9 @@ class DetrConfig(PretrainedConfig): Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. - use_pretrained_backbone (`bool`, *optional*, `True`): - Whether to use pretrained weights for the backbone. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + use_pretrained_backbone (`bool`, *optional*, `False`): + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -179,11 +177,6 @@ def __init__( eos_coefficient=0.1, **kwargs, ): - if not use_timm_backbone and use_pretrained_backbone: - raise ValueError( - "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" - ) - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 9bdc8d1ef0affb..3b533b07b585e0 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -114,13 +114,8 @@ class DPTConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, defaults to `False`): - Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers - library. 
- backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. Example: @@ -173,8 +168,6 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, - use_timm_backbone=False, - backbone_kwargs=None, **kwargs, ): super().__init__(**kwargs) @@ -182,8 +175,8 @@ def __init__( self.hidden_size = hidden_size self.is_hybrid = is_hybrid - if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") use_autobackbone = False if self.is_hybrid: @@ -229,16 +222,8 @@ def __init__( self.backbone_featmap_shape = None self.neck_ignore_stages = [] - if use_autobackbone and backbone_config is not None and backbone is not None: - raise ValueError("You can't specify both `backbone` and `backbone_config`.") - - if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: - raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs self.num_hidden_layers = None if use_autobackbone else num_hidden_layers self.num_attention_heads = None if use_autobackbone else num_attention_heads self.intermediate_size = None if use_autobackbone else intermediate_size diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index f0d13b8e030ed1..92d472591cf175 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ b/src/transformers/models/mask2former/configuration_mask2former.py @@ -46,13 +46,8 @@ class Mask2FormerConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): - Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers - library. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. feature_size (`int`, *optional*, defaults to 256): The features (channels) of the resulting feature maps. 
mask_feature_size (`int`, *optional*, defaults to 256): @@ -160,15 +155,10 @@ def __init__( use_auxiliary_loss: bool = True, feature_strides: List[int] = [4, 8, 16, 32], output_auxiliary_logits: bool = None, - backbone: Optional[str] = None, - use_pretrained_backbone: bool = False, - use_timm_backbone: bool = False, - backbone_kwargs: Optional[Dict] = None, + backbone=None, + use_pretrained_backbone=False, **kwargs, ): - if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") @@ -233,8 +223,6 @@ def __init__( self.num_hidden_layers = decoder_layers self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs super().__init__(**kwargs) diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index 653350ca056dda..e00a622ad2882b 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -56,13 +56,8 @@ class MaskFormerConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): - Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers - library. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. decoder_config (`Dict`, *optional*): The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50` will be used. 
@@ -122,19 +117,11 @@ def __init__( output_auxiliary_logits: Optional[bool] = None, backbone: Optional[str] = None, use_pretrained_backbone: bool = False, - use_timm_backbone: bool = False, - backbone_kwargs: Optional[Dict] = None, **kwargs, ): - if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") - if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: - raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if backbone_config is None and backbone is None: # fall back to https://huggingface.co/microsoft/swin-base-patch4-window12-384-in22k backbone_config = SwinConfig( @@ -198,8 +185,6 @@ def __init__( self.num_hidden_layers = self.decoder_config.num_hidden_layers self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs super().__init__(**kwargs) @classmethod diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index 1cbd2ab7dbc18f..b8f837462c1c31 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -45,13 +45,8 @@ class OneFormerConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, defaults to `False`): - Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers - library. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. ignore_value (`int`, *optional*, defaults to 255): Values to be ignored in GT label while calculating loss. 
num_queries (`int`, *optional*, defaults to 150): @@ -154,8 +149,6 @@ def __init__( backbone_config: Optional[Dict] = None, backbone: Optional[str] = None, use_pretrained_backbone: bool = False, - use_timm_backbone: bool = False, - backbone_kwargs: Optional[Dict] = None, ignore_value: int = 255, num_queries: int = 150, no_object_weight: int = 0.1, @@ -198,9 +191,6 @@ def __init__( common_stride: int = 4, **kwargs, ): - if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") @@ -223,14 +213,9 @@ def __init__( config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) - if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: - raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs self.ignore_value = ignore_value self.num_queries = num_queries self.no_object_weight = no_object_weight diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 9a2ff6bbab3b24..42457cd63f8d6e 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -93,11 +93,9 @@ class TableTransformerConfig(PretrainedConfig): Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. - use_pretrained_backbone (`bool`, *optional*, `True`): - Whether to use pretrained weights for the backbone. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + use_pretrained_backbone (`bool`, *optional*, `False`): + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -179,11 +177,6 @@ def __init__( eos_coefficient=0.1, **kwargs, ): - if not use_timm_backbone and use_pretrained_backbone: - raise ValueError( - "Loading pretrained backbone weights from the transformers library is not supported yet. 
`use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" - ) - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 85b7ac6a41cbcc..ccdc54ae07747b 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -46,13 +46,8 @@ class TvpConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, defaults to `False`): - Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers - library. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. distance_loss_weight (`float`, *optional*, defaults to 1.0): The weight of distance loss. duration_loss_weight (`float`, *optional*, defaults to 0.1): @@ -107,8 +102,6 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, - use_timm_backbone=False, - backbone_kwargs=None, distance_loss_weight=1.0, duration_loss_weight=0.1, visual_prompter_type="framepad", @@ -132,9 +125,6 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") @@ -152,8 +142,6 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs self.distance_loss_weight = distance_loss_weight self.duration_loss_weight = duration_loss_weight self.visual_prompter_type = visual_prompter_type diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index 609818c80d17b7..664ef47a5bccf9 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -41,13 +41,8 @@ class UperNetConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): - Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers - library. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. 
Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. hidden_size (`int`, *optional*, defaults to 512): The number of hidden units in the convolutional layers. initializer_range (`float`, *optional*, defaults to 0.02): @@ -89,8 +84,6 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, - use_timm_backbone=False, - backbone_kwargs=None, hidden_size=512, initializer_range=0.02, pool_scales=[1, 2, 3, 6], @@ -104,9 +97,6 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") @@ -124,8 +114,6 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs self.hidden_size = hidden_size self.initializer_range = initializer_range self.pool_scales = pool_scales diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 8a8a808ec60d05..2b9b0074ac190a 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -45,13 +45,8 @@ class ViTHybridConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, defaults to `False`): - Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers - library. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
num_hidden_layers (`int`, *optional*, defaults to 12): @@ -104,8 +99,6 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, - use_timm_backbone=False, - backbone_kwargs=None, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -123,9 +116,6 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") @@ -156,8 +146,6 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 275640d1d079a1..9a42a336d14370 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -46,13 +46,8 @@ class VitMatteConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, defaults to `False`): - Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers - library. - backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to AutoBackbone when loading from a checkpoint - e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` + when this is `False`. hidden_size (`int`, *optional*, defaults to 384): The number of input channels of the decoder. 
batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -86,8 +81,6 @@ def __init__( backbone_config: PretrainedConfig = None, backbone=None, use_pretrained_backbone=False, - use_timm_backbone=False, - backbone_kwargs=None, hidden_size: int = 384, batch_norm_eps: float = 1e-5, initializer_range: float = 0.02, @@ -97,9 +90,6 @@ def __init__( ): super().__init__(**kwargs) - if use_pretrained_backbone: - raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") @@ -117,8 +107,6 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs self.batch_norm_eps = batch_norm_eps self.hidden_size = hidden_size self.initializer_range = initializer_range diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index 14fcfe4a50a2d2..e7ae99d40e2d0a 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -304,47 +304,28 @@ def load_backbone(config): use_timm_backbone = getattr(config, "use_timm_backbone", None) use_pretrained_backbone = getattr(config, "use_pretrained_backbone", None) backbone_checkpoint = getattr(config, "backbone", None) - backbone_kwargs = getattr(config, "backbone_kwargs", None) - - backbone_kwargs = {} if backbone_kwargs is None else backbone_kwargs - - if backbone_kwargs and backbone_config is not None: - raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") # If there is a backbone_config and a backbone checkpoint, and use_pretrained_backbone=False then the desired # behaviour is ill-defined: do you want to load from the checkpoint's config or the backbone_config? if backbone_config is not None and backbone_checkpoint is not None and use_pretrained_backbone is not None: raise ValueError("Cannot specify both config.backbone_config and config.backbone") - # If any of thhe following are set, then the config passed in is from a model which contains a backbone. - if ( - backbone_config is None - and use_timm_backbone is None - and backbone_checkpoint is None - and backbone_checkpoint is None - ): - return AutoBackbone.from_config(config=config, **backbone_kwargs) - - # config from the parent model that has a backbone - if use_timm_backbone: - if backbone_checkpoint is None: - raise ValueError("config.backbone must be set if use_timm_backbone is True") - # Because of how timm backbones were originally added to models, we need to pass in use_pretrained_backbone - # to determine whether to load the pretrained weights. - backbone = AutoBackbone.from_pretrained( - backbone_checkpoint, - use_timm_backbone=use_timm_backbone, - use_pretrained_backbone=use_pretrained_backbone, - **backbone_kwargs, - ) - elif use_pretrained_backbone: - if backbone_checkpoint is None: - raise ValueError("config.backbone must be set if use_pretrained_backbone is True") - backbone = AutoBackbone.from_pretrained(backbone_checkpoint, **backbone_kwargs) + if backbone_config is not None or use_timm_backbone or backbone_checkpoint is not None: + # This is a config from the parent model the has a backbone. This contains the pretrained backbone checkpoint + # if specified. + # By default, most models don't have use_pretrained_backbone set. 
+ if use_pretrained_backbone: + if backbone_checkpoint is None: + raise ValueError("config.backbone must be set if use_pretrained_backbone is True") + backbone = AutoBackbone.from_pretrained( + backbone_checkpoint, + use_timm_backbone=getattr(config, "use_timm_backbone", False), + ) + else: + if backbone_config is None: + backbone_config = AutoConfig.from_pretrained(backbone_checkpoint) + backbone = AutoBackbone.from_config(config=backbone_config) else: - if backbone_config is None and backbone_checkpoint is None: - raise ValueError("Either config.backbone_config or config.backbone must be set") - if backbone_config is None: - backbone_config = AutoConfig.from_pretrained(backbone_checkpoint, **backbone_kwargs) - backbone = AutoBackbone.from_config(config=backbone_config) + # This is a backbone config, so we just initialize the backbone model with random weights directly. + backbone = AutoBackbone.from_config(config=config) return backbone diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index 244f62950f048f..b289f6ada6d992 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -16,7 +16,7 @@ import pytest -from transformers import DetrConfig, MaskFormerConfig, ResNetBackbone, ResNetConfig, TimmBackbone +from transformers import MaskFormerConfig from transformers.testing_utils import require_torch, slow from transformers.utils.backbone_utils import ( BackboneMixin, @@ -30,7 +30,8 @@ if is_torch_available(): import torch - from transformers import BertPreTrainedModel + # from transformers import PretrainedModel + from transformers import BertPreTrainedModel as PretrainedModel class BackboneUtilsTester(unittest.TestCase): @@ -137,65 +138,6 @@ def test_backbone_mixin(self): self.assertEqual(backbone.out_features, ["a", "c"]) self.assertEqual(backbone.out_indices, [-3, -1]) - @slow - @require_torch - def test_load_backbone_from_config(self): - """ - Test that load_backbone correctly loads a backbone from a backbone config. - """ - config = MaskFormerConfig(backbone_config=ResNetConfig(out_indices=(0, 2))) - backbone = load_backbone(config) - self.assertEqual(backbone.out_features, ["stem", "stage2"]) - self.assertEqual(backbone.out_indices, (0, 2)) - self.assertIsInstance(backbone, ResNetBackbone) - - @slow - @require_torch - def test_load_backbone_from_checkpoint(self): - """ - Test that load_backbone correctly loads a backbone from a checkpoint. - """ - config = MaskFormerConfig(backbone="microsoft/resnet-18", backbone_config=None) - backbone = load_backbone(config) - self.assertEqual(backbone.out_indices, [4]) - self.assertEqual(backbone.out_features, ["stage4"]) - self.assertIsInstance(backbone, ResNetBackbone) - - config = MaskFormerConfig( - backbone="resnet18", - use_timm_backbone=True, - ) - backbone = load_backbone(config) - # We can't know ahead of time the exact output features and indices, or the layer names before - # creating the timm model, so it defalts to the last layer (-1,) and has a different layer name - self.assertEqual(backbone.out_indices, (-1,)) - self.assertEqual(backbone.out_features, ["layer4"]) - self.assertIsInstance(backbone, TimmBackbone) - - @slow - @require_torch - def test_load_backbone_backbone_kwargs(self): - """ - Test that load_backbone correctly configures the loaded backbone with the provided kwargs. 
- """ - config = MaskFormerConfig(backbone="resnet18", use_timm_backbone=True, backbone_kwargs={"out_indices": (0, 1)}) - backbone = load_backbone(config) - self.assertEqual(backbone.out_indices, (0, 1)) - self.assertIsInstance(backbone, TimmBackbone) - - config = MaskFormerConfig(backbone="microsoft/resnet-18", backbone_kwargs={"out_indices": (0, 2)}) - backbone = load_backbone(config) - self.assertEqual(backbone.out_indices, (0, 2)) - self.assertIsInstance(backbone, ResNetBackbone) - - # Check can't be passed with a backone config - with pytest.raises(ValueError): - config = MaskFormerConfig( - backbone="microsoft/resnet-18", - backbone_config=ResNetConfig(out_indices=(0, 2)), - backbone_kwargs={"out_indices": (0, 1)}, - ) - @slow @require_torch def test_load_backbone_in_new_model(self): @@ -203,8 +145,7 @@ def test_load_backbone_in_new_model(self): Tests that new model can be created, with its weights instantiated and pretrained backbone weights loaded. """ - # Inherit from PreTrainedModel to ensure that the weights are initialized - class NewModel(BertPreTrainedModel): + class NewModel(PretrainedModel): def __init__(self, config): super().__init__(config) self.backbone = load_backbone(config) @@ -244,26 +185,3 @@ def get_equal_not_equal_weights(model_0, model_1): self.assertEqual(len(equal_weights), 20) # Linear layers are still initialized randomly self.assertEqual(len(not_equal_weights), 4) - - # Check loading in timm backbone - config = DetrConfig(use_pretrained_backbone=False, backbone="resnet18", use_timm_backbone=True) - model_0 = NewModel(config) - model_1 = NewModel(config) - equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) - - # Norm layers are always initialized with the same weights - equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w] - self.assertEqual(len(equal_weights), 0) - self.assertEqual(len(not_equal_weights), 24) - - # Now we create a new model with backbone weights that are pretrained - config.use_pretrained_backbone = True - model_0 = NewModel(config) - model_1 = NewModel(config) - equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) - - # Norm layers are always initialized with the same weights - equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w] - self.assertEqual(len(equal_weights), 20) - # Linear layers are still initialized randomly - self.assertEqual(len(not_equal_weights), 4) diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index f631c59b75d40e..cf4348fb530388 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -234,12 +234,7 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s "out_features", "out_indices", "sampling_rate", - # backbone related arguments passed to load_backbone "use_pretrained_backbone", - "backbone", - "backbone_config", - "use_timm_backbone", - "backbone_kwargs", ] attributes_used_in_generation = ["encoder_no_repeat_ngram_size"] From 589007d53f77f45778159b2b128d172cc1c7ba2d Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:33:32 +0000 Subject: [PATCH 09/32] Update tests so backbone checkpoint isn't passed in --- tests/models/conditional_detr/test_modeling_conditional_detr.py | 1 - tests/models/deformable_detr/test_modeling_deformable_detr.py | 2 +- tests/models/detr/test_modeling_detr.py | 1 - 
.../models/table_transformer/test_modeling_table_transformer.py | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index d1152ed8622b9c..23424ba5e9f4ab 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -135,7 +135,6 @@ def get_config(self): use_timm_backbone=False, backbone_config=resnet_config, backbone=None, - use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 7a83c4f1ed80a8..9ff1f5c24b1173 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -152,7 +152,7 @@ def get_config(self): use_timm_backbone=False, backbone=None, backbone_config=resnet_config, - use_pretrained_backbone=False, + backbone=None, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 59b071e031aa8a..bc8b96b2cf2fb0 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -131,7 +131,6 @@ def get_config(self): use_timm_backbone=False, backbone_config=resnet_config, backbone=None, - use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 79da1d191063ab..361a916efdd8fb 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -132,7 +132,6 @@ def get_config(self): use_timm_backbone=False, backbone_config=resnet_config, backbone=None, - use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): From e5930076544d7448e5896fed0155b8fc9f6621e7 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 3 Jan 2024 20:40:24 +0000 Subject: [PATCH 10/32] Clarify pretrained import --- tests/utils/test_backbone_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index b289f6ada6d992..3326d12651d8d3 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -30,8 +30,7 @@ if is_torch_available(): import torch - # from transformers import PretrainedModel - from transformers import BertPreTrainedModel as PretrainedModel + from transformers import BertPreTrainedModel class BackboneUtilsTester(unittest.TestCase): @@ -145,7 +144,8 @@ def test_load_backbone_in_new_model(self): Tests that new model can be created, with its weights instantiated and pretrained backbone weights loaded. 
""" - class NewModel(PretrainedModel): + # Inherit from PreTrainedModel to ensure that the weights are initialized + class NewModel(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.backbone = load_backbone(config) From 4a601dc727430f06caa2979f02bf56b4b800b0ce Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:25:16 +0000 Subject: [PATCH 11/32] Update configs - docs and validation check --- .../conditional_detr/configuration_conditional_detr.py | 8 ++++++-- .../deformable_detr/configuration_deformable_detr.py | 8 ++++++-- src/transformers/models/deta/configuration_deta.py | 6 ++++-- src/transformers/models/detr/configuration_detr.py | 10 +++++++--- src/transformers/models/dpt/configuration_dpt.py | 6 ++++-- .../models/mask2former/configuration_mask2former.py | 6 ++++-- .../models/maskformer/configuration_maskformer.py | 6 ++++-- .../models/oneformer/configuration_oneformer.py | 6 ++++-- .../configuration_table_transformer.py | 10 +++++++--- src/transformers/models/tvp/configuration_tvp.py | 6 ++++-- .../models/upernet/configuration_upernet.py | 6 ++++-- .../models/vit_hybrid/configuration_vit_hybrid.py | 6 ++++-- .../models/vitmatte/configuration_vitmatte.py | 6 ++++-- 13 files changed, 62 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 163865f7332343..8224388e001841 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -94,8 +94,7 @@ class ConditionalDetrConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -179,6 +178,11 @@ def __init__( focal_alpha=0.25, **kwargs, ): + if not use_timm_backbone and use_pretrained_backbone: + raise ValueError( + "It is not possible yet to use pretrained weights without `use_timm_backbone` set to `True`." + ) + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 6a9cb70b4221e7..24fa0b4326f8c0 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -87,8 +87,7 @@ class DeformableDetrConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. 
use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -196,6 +195,11 @@ def __init__( disable_custom_kernels=False, **kwargs, ): + if not use_timm_backbone and use_pretrained_backbone: + raise ValueError( + "It is not possible yet to use pretrained weights without `use_timm_backbone` set to `True`." + ) + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index 124fe9d94d2b59..83bda6d8183650 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -44,8 +44,7 @@ class DetaConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead. @@ -189,6 +188,9 @@ def __init__( disable_custom_kernels=True, **kwargs, ): + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 56fb76a27728b7..9767323a05fa67 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -94,9 +94,8 @@ class DetrConfig(PretrainedConfig): Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. - use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + use_pretrained_backbone (`bool`, *optional*, `True`): + Whether to use pretrained weights for the backbone. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. 
@@ -177,6 +176,11 @@ def __init__( eos_coefficient=0.1, **kwargs, ): + if not use_timm_backbone and use_pretrained_backbone: + raise ValueError( + "It is not possible yet to use pretrained weights without `use_timm_backbone` set to `True`." + ) + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 3b533b07b585e0..7771c3a0d399c9 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -114,8 +114,7 @@ class DPTConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. Example: @@ -175,6 +174,9 @@ def __init__( self.hidden_size = hidden_size self.is_hybrid = is_hybrid + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index 92d472591cf175..9e278d8994cd0a 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ b/src/transformers/models/mask2former/configuration_mask2former.py @@ -46,8 +46,7 @@ class Mask2FormerConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. feature_size (`int`, *optional*, defaults to 256): The features (channels) of the resulting feature maps. mask_feature_size (`int`, *optional*, defaults to 256): @@ -159,6 +158,9 @@ def __init__( use_pretrained_backbone=False, **kwargs, ): + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index e00a622ad2882b..e0ba4d4062b341 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -56,8 +56,7 @@ class MaskFormerConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. 
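The configs that only support backbones from the transformers library get a blunter guard, shown above for DPT and Mask2Former. A short sketch of what that means for users (hypothetical usage):

```python
from transformers import Mask2FormerConfig

# Pretrained backbone weights are rejected outright for these models at this point in the series
try:
    Mask2FormerConfig(use_pretrained_backbone=True)
except ValueError as err:
    print(err)  # Pretrained backbones are not supported yet.

# The default path still works: a randomly initialised Swin backbone config
config = Mask2FormerConfig()
```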
use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. decoder_config (`Dict`, *optional*): The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50` will be used. @@ -119,6 +118,9 @@ def __init__( use_pretrained_backbone: bool = False, **kwargs, ): + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index b8f837462c1c31..d249a0f8337502 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -45,8 +45,7 @@ class OneFormerConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. ignore_value (`int`, *optional*, defaults to 255): Values to be ignored in GT label while calculating loss. num_queries (`int`, *optional*, defaults to 150): @@ -191,6 +190,9 @@ def __init__( common_stride: int = 4, **kwargs, ): + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 42457cd63f8d6e..363bf28122bd43 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -93,9 +93,8 @@ class TableTransformerConfig(PretrainedConfig): Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. - use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + use_pretrained_backbone (`bool`, *optional*, `True`): + Whether to use pretrained weights for the backbone. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -177,6 +176,11 @@ def __init__( eos_coefficient=0.1, **kwargs, ): + if not use_timm_backbone and use_pretrained_backbone: + raise ValueError( + "It is not possible yet to use pretrained weights without `use_timm_backbone` set to `True`." 
+ ) + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index ccdc54ae07747b..23d8b05c117b96 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -46,8 +46,7 @@ class TvpConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. distance_loss_weight (`float`, *optional*, defaults to 1.0): The weight of distance loss. duration_loss_weight (`float`, *optional*, defaults to 0.1): @@ -125,6 +124,9 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index 664ef47a5bccf9..2d54c821b8c8d5 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -41,8 +41,7 @@ class UperNetConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. hidden_size (`int`, *optional*, defaults to 512): The number of hidden units in the convolutional layers. initializer_range (`float`, *optional*, defaults to 0.02): @@ -97,6 +96,9 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 2b9b0074ac190a..71cd4b2b3da247 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -45,8 +45,7 @@ class ViTHybridConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. 
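Every config touched here also keeps the existing rule that a named `backbone` and an explicit `backbone_config` are mutually exclusive. A quick sketch (hypothetical usage; the checkpoint name is only illustrative and the error fires before anything is downloaded):

```python
from transformers import ConvNextConfig, UperNetConfig

try:
    UperNetConfig(backbone="facebook/convnext-tiny-224", backbone_config=ConvNextConfig())
except ValueError as err:
    print(err)  # You can't specify both `backbone` and `backbone_config`.
```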
hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -116,6 +115,9 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 9a42a336d14370..b90a0f29e74798 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -46,8 +46,7 @@ class VitMatteConfig(PretrainedConfig): will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): - Whether to use pretrained weights for the backbone. You cannot specify both `backbone` and `backbone_config` - when this is `False`. + Whether to use pretrained weights for the backbone. hidden_size (`int`, *optional*, defaults to 384): The number of input channels of the decoder. batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -90,6 +89,9 @@ def __init__( ): super().__init__(**kwargs) + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") From 4506d08783841f7f3f62288a954f74d2bf772d21 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:25:34 +0000 Subject: [PATCH 12/32] Update src/transformers/utils/backbone_utils.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/utils/backbone_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index e7ae99d40e2d0a..566d19ff26190c 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -311,7 +311,7 @@ def load_backbone(config): raise ValueError("Cannot specify both config.backbone_config and config.backbone") if backbone_config is not None or use_timm_backbone or backbone_checkpoint is not None: - # This is a config from the parent model the has a backbone. This contains the pretrained backbone checkpoint + # This is a config from the parent model that has a backbone. This contains the pretrained backbone checkpoint # if specified. # By default, most models don't have use_pretrained_backbone set. 
if use_pretrained_backbone: From cbc0f6ab443c0f16b6cf8afffbe20f50c2bdd8b9 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:50:37 +0000 Subject: [PATCH 13/32] Clarify exception message --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- .../models/deformable_detr/configuration_deformable_detr.py | 2 +- src/transformers/models/detr/configuration_detr.py | 2 +- .../models/table_transformer/configuration_table_transformer.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 8224388e001841..06cb33bab74645 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -180,7 +180,7 @@ def __init__( ): if not use_timm_backbone and use_pretrained_backbone: raise ValueError( - "It is not possible yet to use pretrained weights without `use_timm_backbone` set to `True`." + "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" ) if backbone_config is not None and backbone is not None: diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 24fa0b4326f8c0..994b7b77e5c819 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -197,7 +197,7 @@ def __init__( ): if not use_timm_backbone and use_pretrained_backbone: raise ValueError( - "It is not possible yet to use pretrained weights without `use_timm_backbone` set to `True`." + "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" ) if backbone_config is not None and backbone is not None: diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 9767323a05fa67..b6834cd940e041 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -178,7 +178,7 @@ def __init__( ): if not use_timm_backbone and use_pretrained_backbone: raise ValueError( - "It is not possible yet to use pretrained weights without `use_timm_backbone` set to `True`." + "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" ) if backbone_config is not None and backbone is not None: diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 363bf28122bd43..613863206bd416 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -178,7 +178,7 @@ def __init__( ): if not use_timm_backbone and use_pretrained_backbone: raise ValueError( - "It is not possible yet to use pretrained weights without `use_timm_backbone` set to `True`." + "Loading pretrained backbone weights from the transformers library is not supported yet. 
`use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" ) if backbone_config is not None and backbone is not None: From 89804ae6ff2eb36dd11772a5fb9192e559aed15a Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 11:07:15 +0000 Subject: [PATCH 14/32] Update config init in tests --- tests/models/conditional_detr/test_modeling_conditional_detr.py | 1 + tests/models/deformable_detr/test_modeling_deformable_detr.py | 1 + tests/models/detr/test_modeling_detr.py | 1 + .../models/table_transformer/test_modeling_table_transformer.py | 1 + 4 files changed, 4 insertions(+) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 23424ba5e9f4ab..d1152ed8622b9c 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -135,6 +135,7 @@ def get_config(self): use_timm_backbone=False, backbone_config=resnet_config, backbone=None, + use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 9ff1f5c24b1173..8487b5dc694d54 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -153,6 +153,7 @@ def get_config(self): backbone=None, backbone_config=resnet_config, backbone=None, + use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index bc8b96b2cf2fb0..59b071e031aa8a 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -131,6 +131,7 @@ def get_config(self): use_timm_backbone=False, backbone_config=resnet_config, backbone=None, + use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 361a916efdd8fb..79da1d191063ab 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -132,6 +132,7 @@ def get_config(self): use_timm_backbone=False, backbone_config=resnet_config, backbone=None, + use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): From 6300aef748a9a034ac4249240370535efb3c34a3 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:28:50 +0000 Subject: [PATCH 15/32] Add test for when use_timm_backbone=True --- src/transformers/utils/backbone_utils.py | 44 +++++++++++++++--------- tests/utils/test_backbone_utils.py | 25 +++++++++++++- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index 566d19ff26190c..22c35c3f9b6e06 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -310,22 +310,32 @@ def load_backbone(config): if backbone_config is not None and backbone_checkpoint is not None and use_pretrained_backbone is not None: raise ValueError("Cannot specify both config.backbone_config and config.backbone") - if backbone_config 
is not None or use_timm_backbone or backbone_checkpoint is not None: - # This is a config from the parent model that has a backbone. This contains the pretrained backbone checkpoint - # if specified. - # By default, most models don't have use_pretrained_backbone set. - if use_pretrained_backbone: - if backbone_checkpoint is None: - raise ValueError("config.backbone must be set if use_pretrained_backbone is True") - backbone = AutoBackbone.from_pretrained( - backbone_checkpoint, - use_timm_backbone=getattr(config, "use_timm_backbone", False), - ) - else: - if backbone_config is None: - backbone_config = AutoConfig.from_pretrained(backbone_checkpoint) - backbone = AutoBackbone.from_config(config=backbone_config) + # If any of the following are set, then the config passed in is from a model which contains a backbone. + if ( + backbone_config is None + and use_timm_backbone is None + and backbone_checkpoint is None + and backbone_checkpoint is None + ): + return AutoBackbone.from_config(config=config) + + # config from the parent model that has a backbone + if use_timm_backbone: + if backbone_checkpoint is None: + raise ValueError("config.backbone must be set if use_timm_backbone is True") + # Because of how timm backbones were originally added to models, we need to pass in use_pretrained_backbone + # to determine whether to load the pretrained weights. + backbone = AutoBackbone.from_pretrained( + backbone_checkpoint, use_timm_backbone=use_timm_backbone, use_pretrained_backbone=use_pretrained_backbone + ) + elif use_pretrained_backbone: + if backbone_checkpoint is None: + raise ValueError("config.backbone must be set if use_pretrained_backbone is True") + backbone = AutoBackbone.from_pretrained(backbone_checkpoint) else: - # This is a backbone config, so we just initialize the backbone model with random weights directly.
- backbone = AutoBackbone.from_config(config=config) + if backbone_config is None and backbone_checkpoint is None: + raise ValueError("Either config.backbone_config or config.backbone must be set") + if backbone_config is None: + backbone_config = AutoConfig.from_pretrained(backbone_checkpoint) + backbone = AutoBackbone.from_config(config=backbone_config) return backbone diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index 3326d12651d8d3..0c3ff4866e8379 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -16,7 +16,7 @@ import pytest -from transformers import MaskFormerConfig +from transformers import DetrConfig, MaskFormerConfig from transformers.testing_utils import require_torch, slow from transformers.utils.backbone_utils import ( BackboneMixin, @@ -185,3 +185,26 @@ def get_equal_not_equal_weights(model_0, model_1): self.assertEqual(len(equal_weights), 20) # Linear layers are still initialized randomly self.assertEqual(len(not_equal_weights), 4) + + # Check loading in timm backbone + config = DetrConfig(use_pretrained_backbone=False, backbone="resnet18", use_timm_backbone=True) + model_0 = NewModel(config) + model_1 = NewModel(config) + equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) + + # Norm layers are always initialized with the same weights + equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w] + self.assertEqual(len(equal_weights), 0) + self.assertEqual(len(not_equal_weights), 24) + + # Now we create a new model with backbone weights that are pretrained + config.use_pretrained_backbone = True + model_0 = NewModel(config) + model_1 = NewModel(config) + equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) + + # Norm layers are always initialized with the same weights + equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w] + self.assertEqual(len(equal_weights), 20) + # Linear layers are still initialized randomly + self.assertEqual(len(not_equal_weights), 4) From 6870317a7b1d4a1eeee501f55b8975e2c1aae6f2 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 15:17:26 +0000 Subject: [PATCH 16/32] Use load_backbone instead --- src/transformers/models/dpt/modeling_dpt.py | 6 +++--- utils/check_config_attributes.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index aad3330279f051..a3899f86ff8457 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1075,10 +1075,10 @@ def __init__(self, config): super().__init__(config) self.backbone = None - if config.backbone_config is not None and config.is_hybrid is False: - self.backbone = load_backbone(config) - else: + if config.is_hybrid: self.dpt = DPTModel(config, add_pooling_layer=False) + else: + self.backbone = load_backbone(config) # Neck self.neck = DPTNeck(config) diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index cf4348fb530388..f97fd2ef31f5df 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -235,6 +235,8 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s "out_indices", "sampling_rate", "use_pretrained_backbone", + "backbone", + "backbone_config", ] attributes_used_in_generation = 
["encoder_no_repeat_ngram_size"] From aa3376c50ec1fa34d73b98bf6e95237ea71f8429 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 18:23:38 +0000 Subject: [PATCH 17/32] Add use_timm_backbone to the model configs --- src/transformers/models/deta/configuration_deta.py | 5 +++++ src/transformers/models/dpt/configuration_dpt.py | 5 +++++ .../models/mask2former/configuration_mask2former.py | 5 +++++ .../models/maskformer/configuration_maskformer.py | 5 +++++ src/transformers/models/oneformer/configuration_oneformer.py | 5 +++++ src/transformers/models/tvp/configuration_tvp.py | 4 ++++ src/transformers/models/upernet/configuration_upernet.py | 5 +++++ .../models/vit_hybrid/configuration_vit_hybrid.py | 5 +++++ src/transformers/models/vitmatte/configuration_vitmatte.py | 5 +++++ utils/check_config_attributes.py | 2 ++ 10 files changed, 46 insertions(+) diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index 83bda6d8183650..4dccce1c82d6e1 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -45,6 +45,9 @@ class DetaConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead. @@ -148,6 +151,7 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, num_queries=900, max_position_embeddings=2048, encoder_layers=6, @@ -209,6 +213,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 7771c3a0d399c9..4d6ac65767c8ce 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -115,6 +115,9 @@ class DPTConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. 
Example: @@ -167,6 +170,7 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, **kwargs, ): super().__init__(**kwargs) @@ -226,6 +230,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.num_hidden_layers = None if use_autobackbone else num_hidden_layers self.num_attention_heads = None if use_autobackbone else num_attention_heads self.intermediate_size = None if use_autobackbone else intermediate_size diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index 9e278d8994cd0a..a6255df0e1a868 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ b/src/transformers/models/mask2former/configuration_mask2former.py @@ -47,6 +47,9 @@ class Mask2FormerConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. feature_size (`int`, *optional*, defaults to 256): The features (channels) of the resulting feature maps. mask_feature_size (`int`, *optional*, defaults to 256): @@ -156,6 +159,7 @@ def __init__( output_auxiliary_logits: bool = None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, **kwargs, ): if use_pretrained_backbone: @@ -225,6 +229,7 @@ def __init__( self.num_hidden_layers = decoder_layers self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone super().__init__(**kwargs) diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index e0ba4d4062b341..85034876f744d9 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -57,6 +57,9 @@ class MaskFormerConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. decoder_config (`Dict`, *optional*): The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50` will be used. 
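The `use_timm_backbone` flag documented above is what the reworked `load_backbone` from earlier in this series keys on. A simplified restatement of its decision order, returning labels instead of building models (a sketch, not the literal implementation):

```python
def describe_backbone_path(backbone_config, backbone, use_timm_backbone, use_pretrained_backbone):
    """Mirror the branch `load_backbone` takes for a given set of config attributes."""
    # None of the backbone attributes are present: the config *is* a backbone config itself.
    if backbone_config is None and use_timm_backbone is None and backbone is None:
        return "AutoBackbone.from_config(config)"
    if use_timm_backbone:
        # timm path; pretrained-ness is forwarded so timm decides whether to download weights
        return "AutoBackbone.from_pretrained(backbone, use_timm_backbone=True, use_pretrained_backbone=...)"
    if use_pretrained_backbone:
        return "AutoBackbone.from_pretrained(backbone)"
    # random init: resolve a backbone config first if only a checkpoint name was given
    return "AutoBackbone.from_config(backbone_config or AutoConfig.from_pretrained(backbone))"
```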
@@ -116,6 +119,7 @@ def __init__( output_auxiliary_logits: Optional[bool] = None, backbone: Optional[str] = None, use_pretrained_backbone: bool = False, + use_timm_backbone: bool = False, **kwargs, ): if use_pretrained_backbone: @@ -187,6 +191,7 @@ def __init__( self.num_hidden_layers = self.decoder_config.num_hidden_layers self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone super().__init__(**kwargs) @classmethod diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index d249a0f8337502..a6e5cd2a9d1c2a 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -46,6 +46,9 @@ class OneFormerConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. ignore_value (`int`, *optional*, defaults to 255): Values to be ignored in GT label while calculating loss. num_queries (`int`, *optional*, defaults to 150): @@ -148,6 +151,7 @@ def __init__( backbone_config: Optional[Dict] = None, backbone: Optional[str] = None, use_pretrained_backbone: bool = False, + use_timm_backbone: bool = False, ignore_value: int = 255, num_queries: int = 150, no_object_weight: int = 0.1, @@ -218,6 +222,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.ignore_value = ignore_value self.num_queries = num_queries self.no_object_weight = no_object_weight diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 23d8b05c117b96..ef4f2c840bdd23 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -47,6 +47,9 @@ class TvpConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. distance_loss_weight (`float`, *optional*, defaults to 1.0): The weight of distance loss. 
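The backbone tests extended earlier in this series rely on a simple trick: build the same model twice and see which parameters agree. Pretrained backbone weights match across instantiations, while randomly initialised layers almost surely do not. A sketch of that helper in isolation (generic `nn.Module`s, not the real test class):

```python
import torch.nn as nn


def equal_and_not_equal_weights(model_0: nn.Module, model_1: nn.Module):
    """Split parameter names by whether two independently built models agree on their values."""
    equal, not_equal = [], []
    for (name_0, param_0), (name_1, param_1) in zip(model_0.named_parameters(), model_1.named_parameters()):
        assert name_0 == name_1, "models are expected to share an architecture"
        if bool((param_0 == param_1).all()):
            equal.append(name_0)  # identical values: loaded from the same checkpoint (or deterministic init)
        else:
            not_equal.append(name_0)  # differing values: randomly initialised
    return equal, not_equal
```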
duration_loss_weight (`float`, *optional*, defaults to 0.1): @@ -144,6 +147,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.distance_loss_weight = distance_loss_weight self.duration_loss_weight = duration_loss_weight self.visual_prompter_type = visual_prompter_type diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index 2d54c821b8c8d5..7c5813a5f2b5e4 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -42,6 +42,9 @@ class UperNetConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. hidden_size (`int`, *optional*, defaults to 512): The number of hidden units in the convolutional layers. initializer_range (`float`, *optional*, defaults to 0.02): @@ -83,6 +86,7 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, hidden_size=512, initializer_range=0.02, pool_scales=[1, 2, 3, 6], @@ -116,6 +120,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.hidden_size = hidden_size self.initializer_range = initializer_range self.pool_scales = pool_scales diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 71cd4b2b3da247..45d2e072286415 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -46,6 +46,9 @@ class ViTHybridConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
num_hidden_layers (`int`, *optional*, defaults to 12): @@ -98,6 +101,7 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -148,6 +152,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index b90a0f29e74798..a87aaeb5340aa5 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -47,6 +47,9 @@ class VitMatteConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. hidden_size (`int`, *optional*, defaults to 384): The number of input channels of the decoder. batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -80,6 +83,7 @@ def __init__( backbone_config: PretrainedConfig = None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, hidden_size: int = 384, batch_norm_eps: float = 1e-5, initializer_range: float = 0.02, @@ -109,6 +113,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.batch_norm_eps = batch_norm_eps self.hidden_size = hidden_size self.initializer_range = initializer_range diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index f97fd2ef31f5df..a7fb1f0380d9bd 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -234,9 +234,11 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s "out_features", "out_indices", "sampling_rate", + # backbone related arguments passed to load_backbone "use_pretrained_backbone", "backbone", "backbone_config", + "use_timm_backbone" ] attributes_used_in_generation = ["encoder_no_repeat_ngram_size"] From 737fd0ca3d0a26a880046d130263a3e9d3a0ddac Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 18:38:58 +0000 Subject: [PATCH 18/32] Add backbone_kwargs to config --- .../conditional_detr/configuration_conditional_detr.py | 4 +++- .../deformable_detr/configuration_deformable_detr.py | 4 +++- src/transformers/models/deta/configuration_deta.py | 4 ++++ src/transformers/models/detr/configuration_detr.py | 4 +++- src/transformers/models/dpt/configuration_dpt.py | 4 ++++ .../models/mask2former/configuration_mask2former.py | 10 +++++++--- .../models/maskformer/configuration_maskformer.py | 4 ++++ .../models/oneformer/configuration_oneformer.py | 4 ++++ .../configuration_table_transformer.py | 4 +++- src/transformers/models/tvp/configuration_tvp.py | 5 +++++ .../models/upernet/configuration_upernet.py | 4 ++++ .../models/vit_hybrid/configuration_vit_hybrid.py | 4 ++++ 
.../models/vitmatte/configuration_vitmatte.py | 4 ++++ src/transformers/utils/backbone_utils.py | 10 +++++----- 14 files changed, 57 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 06cb33bab74645..1630f75bc2406e 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -95,6 +95,8 @@ class ConditionalDetrConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `True`): Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -225,7 +227,7 @@ def __init__( self.position_embedding_type = position_embedding_type self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.backbone_kwargs = backbone_kwargs + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.dilation = dilation # Hungarian matcher self.class_cost = class_cost diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 994b7b77e5c819..745cdcca6e5da8 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -88,6 +88,8 @@ class DeformableDetrConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `True`): Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -240,7 +242,7 @@ def __init__( self.position_embedding_type = position_embedding_type self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.backbone_kwargs = backbone_kwargs + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.dilation = dilation # deformable attributes self.num_feature_levels = num_feature_levels diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index 4dccce1c82d6e1..a30e496ee50612 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -48,6 +48,8 @@ class DetaConfig(PretrainedConfig): use_timm_backbone (`bool`, *optional*, `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. 
num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead. @@ -152,6 +154,7 @@ def __init__( backbone=None, use_pretrained_backbone=False, use_timm_backbone=False, + backbone_kwargs=None, num_queries=900, max_position_embeddings=2048, encoder_layers=6, @@ -214,6 +217,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index b6834cd940e041..b72d372e117c48 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -96,6 +96,8 @@ class DetrConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `True`): Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -225,7 +227,7 @@ def __init__( self.position_embedding_type = position_embedding_type self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.backbone_kwargs = backbone_kwargs + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.dilation = dilation # Hungarian matcher self.class_cost = class_cost diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 4d6ac65767c8ce..0292cbc24d9467 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -118,6 +118,8 @@ class DPTConfig(PretrainedConfig): use_timm_backbone (`bool`, *optional*, `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. 
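At this stage `backbone_kwargs` is only stored on the config (defaulting to an empty dict, as the `__init__` changes above show); how it is consumed is wired up elsewhere in the series. A config-level sketch with hypothetical values (the timm model name is illustrative):

```python
from transformers import DetrConfig

config = DetrConfig(
    use_timm_backbone=True,
    use_pretrained_backbone=False,
    backbone="resnet50",  # timm model name
    backbone_kwargs={"out_indices": (1, 2, 3, 4)},
)
print(config.backbone_kwargs)  # {'out_indices': (1, 2, 3, 4)}
```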
Example: @@ -171,6 +173,7 @@ def __init__( backbone=None, use_pretrained_backbone=False, use_timm_backbone=False, + backbone_kwargs=None, **kwargs, ): super().__init__(**kwargs) @@ -231,6 +234,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.num_hidden_layers = None if use_autobackbone else num_hidden_layers self.num_attention_heads = None if use_autobackbone else num_attention_heads self.intermediate_size = None if use_autobackbone else intermediate_size diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index a6255df0e1a868..6de55ade8e55e2 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ b/src/transformers/models/mask2former/configuration_mask2former.py @@ -50,6 +50,8 @@ class Mask2FormerConfig(PretrainedConfig): use_timm_backbone (`bool`, *optional*, `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. feature_size (`int`, *optional*, defaults to 256): The features (channels) of the resulting feature maps. mask_feature_size (`int`, *optional*, defaults to 256): @@ -157,9 +159,10 @@ def __init__( use_auxiliary_loss: bool = True, feature_strides: List[int] = [4, 8, 16, 32], output_auxiliary_logits: bool = None, - backbone=None, - use_pretrained_backbone=False, - use_timm_backbone=False, + backbone: Optional[str] = None, + use_pretrained_backbone: bool = False, + use_timm_backbone: bool = False, + backbone_kwargs: Optional[Dict] = None, **kwargs, ): if use_pretrained_backbone: @@ -230,6 +233,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} super().__init__(**kwargs) diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index 85034876f744d9..535b0fa9eef013 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -60,6 +60,8 @@ class MaskFormerConfig(PretrainedConfig): use_timm_backbone (`bool`, *optional*, `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. decoder_config (`Dict`, *optional*): The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50` will be used. 
@@ -120,6 +122,7 @@ def __init__( backbone: Optional[str] = None, use_pretrained_backbone: bool = False, use_timm_backbone: bool = False, + backbone_kwargs: Optional[Dict] = None, **kwargs, ): if use_pretrained_backbone: @@ -192,6 +195,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} super().__init__(**kwargs) @classmethod diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index a6e5cd2a9d1c2a..baf617b4e87da8 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -49,6 +49,8 @@ class OneFormerConfig(PretrainedConfig): use_timm_backbone (`bool`, *optional*, `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. ignore_value (`int`, *optional*, defaults to 255): Values to be ignored in GT label while calculating loss. num_queries (`int`, *optional*, defaults to 150): @@ -152,6 +154,7 @@ def __init__( backbone: Optional[str] = None, use_pretrained_backbone: bool = False, use_timm_backbone: bool = False, + backbone_kwargs: Optional[Dict] = None, ignore_value: int = 255, num_queries: int = 150, no_object_weight: int = 0.1, @@ -223,6 +226,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.ignore_value = ignore_value self.num_queries = num_queries self.no_object_weight = no_object_weight diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 613863206bd416..3f2f0cadb81581 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -95,6 +95,8 @@ class TableTransformerConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `True`): Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. 
@@ -225,7 +227,7 @@ def __init__( self.position_embedding_type = position_embedding_type self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.backbone_kwargs = backbone_kwargs + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.dilation = dilation # Hungarian matcher self.class_cost = class_cost diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index ef4f2c840bdd23..79a406e75a84b5 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -50,6 +50,8 @@ class TvpConfig(PretrainedConfig): use_timm_backbone (`bool`, *optional*, `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. distance_loss_weight (`float`, *optional*, defaults to 1.0): The weight of distance loss. duration_loss_weight (`float`, *optional*, defaults to 0.1): @@ -104,6 +106,8 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, distance_loss_weight=1.0, duration_loss_weight=0.1, visual_prompter_type="framepad", @@ -148,6 +152,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.distance_loss_weight = distance_loss_weight self.duration_loss_weight = duration_loss_weight self.visual_prompter_type = visual_prompter_type diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index 7c5813a5f2b5e4..16131c68a8e1d8 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -45,6 +45,8 @@ class UperNetConfig(PretrainedConfig): use_timm_backbone (`bool`, *optional*, `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. hidden_size (`int`, *optional*, defaults to 512): The number of hidden units in the convolutional layers. 
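For the models above that previously only took a transformers `backbone_config`, the new flags make a timm backbone selectable at the config level too. A small sketch (hypothetical usage; whether a given head model supports a timm backbone end to end is beyond this excerpt):

```python
from transformers import UperNetConfig

config = UperNetConfig(
    backbone="resnet18",            # treated as a timm model name because of the flag below
    use_timm_backbone=True,
    use_pretrained_backbone=False,  # pretrained backbone weights are still rejected for UperNet here
    backbone_kwargs={"out_indices": (1, 2, 3, 4)},
)
```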
initializer_range (`float`, *optional*, defaults to 0.02): @@ -87,6 +89,7 @@ def __init__( backbone=None, use_pretrained_backbone=False, use_timm_backbone=False, + backbone_kwargs=None, hidden_size=512, initializer_range=0.02, pool_scales=[1, 2, 3, 6], @@ -121,6 +124,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.hidden_size = hidden_size self.initializer_range = initializer_range self.pool_scales = pool_scales diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 45d2e072286415..a3240bace4d492 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -49,6 +49,8 @@ class ViTHybridConfig(PretrainedConfig): use_timm_backbone (`bool`, *optional*, `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -102,6 +104,7 @@ def __init__( backbone=None, use_pretrained_backbone=False, use_timm_backbone=False, + backbone_kwargs=None, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -153,6 +156,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index a87aaeb5340aa5..8788e689b674a6 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -50,6 +50,8 @@ class VitMatteConfig(PretrainedConfig): use_timm_backbone (`bool`, *optional*, `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. hidden_size (`int`, *optional*, defaults to 384): The number of input channels of the decoder. 
batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -84,6 +86,7 @@ def __init__( backbone=None, use_pretrained_backbone=False, use_timm_backbone=False, + backbone_kwargs=None, hidden_size: int = 384, batch_norm_eps: float = 1e-5, initializer_range: float = 0.02, @@ -114,6 +117,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} self.batch_norm_eps = batch_norm_eps self.hidden_size = hidden_size self.initializer_range = initializer_range diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index 22c35c3f9b6e06..6e4d68cbf41c82 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -288,7 +288,7 @@ def to_dict(self): return output -def load_backbone(config): +def load_backbone(config, **kwargs): """ Loads the backbone model from a config object. @@ -317,7 +317,7 @@ def load_backbone(config): and backbone_checkpoint is None and backbone_checkpoint is None ): - return AutoBackbone.from_config(config=config) + return AutoBackbone.from_config(config=config, **kwargs) # config from the parent model that has a backbone if use_timm_backbone: @@ -326,16 +326,16 @@ def load_backbone(config): # Because of how timm backbones were originally added to models, we need to pass in use_pretrained_backbone # to determine whether to load the pretrained weights. backbone = AutoBackbone.from_pretrained( - backbone_checkpoint, use_timm_backbone=use_timm_backbone, use_pretrained_backbone=use_pretrained_backbone + backbone_checkpoint, use_timm_backbone=use_timm_backbone, use_pretrained_backbone=use_pretrained_backbone, **kwargs ) elif use_pretrained_backbone: if backbone_checkpoint is None: raise ValueError("config.backbone must be set if use_pretrained_backbone is True") - backbone = AutoBackbone.from_pretrained(backbone_checkpoint) + backbone = AutoBackbone.from_pretrained(backbone_checkpoint, **kwargs) else: if backbone_config is None and backbone_checkpoint is None: raise ValueError("Either config.backbone_config or config.backbone must be set") if backbone_config is None: - backbone_config = AutoConfig.from_pretrained(backbone_checkpoint) + backbone_config = AutoConfig.from_pretrained(backbone_checkpoint, **kwargs) backbone = AutoBackbone.from_config(config=backbone_config) return backbone From 62d79f7634f6069606a286e6880a7edf53d74a1f Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 20:03:15 +0000 Subject: [PATCH 19/32] Pass kwargs to constructors --- src/transformers/models/dpt/configuration_dpt.py | 2 +- .../models/oneformer/configuration_oneformer.py | 2 +- src/transformers/models/tvp/configuration_tvp.py | 2 +- .../models/vit_hybrid/configuration_vit_hybrid.py | 2 +- .../models/vitmatte/configuration_vitmatte.py | 2 +- src/transformers/utils/backbone_utils.py | 15 +++++++++++---- utils/check_config_attributes.py | 3 ++- 7 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 0292cbc24d9467..fbab5a5b79745a 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -115,7 +115,7 @@ class DPTConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with 
random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index baf617b4e87da8..f04a90bf81bebf 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -46,7 +46,7 @@ class OneFormerConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 79a406e75a84b5..872b59f539168b 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -47,7 +47,7 @@ class TvpConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index a3240bace4d492..fb8f52338c6a72 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -46,7 +46,7 @@ class ViTHybridConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 8788e689b674a6..79aecab59b49d8 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -47,7 +47,7 @@ class VitMatteConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. 
- use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index 6e4d68cbf41c82..d5823b52b479e7 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -304,6 +304,10 @@ def load_backbone(config, **kwargs): use_timm_backbone = getattr(config, "use_timm_backbone", None) use_pretrained_backbone = getattr(config, "use_pretrained_backbone", None) backbone_checkpoint = getattr(config, "backbone", None) + backbone_kwargs = getattr(config, "backbone_kwargs", None) + + backbone_kwargs = {} if backbone_kwargs is None else backbone_kwargs + backbone_kwargs.update(kwargs) # If there is a backbone_config and a backbone checkpoint, and use_pretrained_backbone=False then the desired # behaviour is ill-defined: do you want to load from the checkpoint's config or the backbone_config? @@ -317,7 +321,7 @@ def load_backbone(config, **kwargs): and backbone_checkpoint is None and backbone_checkpoint is None ): - return AutoBackbone.from_config(config=config, **kwargs) + return AutoBackbone.from_config(config=config, **backbone_kwargs) # config from the parent model that has a backbone if use_timm_backbone: @@ -326,16 +330,19 @@ def load_backbone(config, **kwargs): # Because of how timm backbones were originally added to models, we need to pass in use_pretrained_backbone # to determine whether to load the pretrained weights. backbone = AutoBackbone.from_pretrained( - backbone_checkpoint, use_timm_backbone=use_timm_backbone, use_pretrained_backbone=use_pretrained_backbone, **kwargs + backbone_checkpoint, + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + **backbone_kwargs, ) elif use_pretrained_backbone: if backbone_checkpoint is None: raise ValueError("config.backbone must be set if use_pretrained_backbone is True") - backbone = AutoBackbone.from_pretrained(backbone_checkpoint, **kwargs) + backbone = AutoBackbone.from_pretrained(backbone_checkpoint, **backbone_kwargs) else: if backbone_config is None and backbone_checkpoint is None: raise ValueError("Either config.backbone_config or config.backbone must be set") if backbone_config is None: - backbone_config = AutoConfig.from_pretrained(backbone_checkpoint, **kwargs) + backbone_config = AutoConfig.from_pretrained(backbone_checkpoint, **backbone_kwargs) backbone = AutoBackbone.from_config(config=backbone_config) return backbone diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index a7fb1f0380d9bd..f631c59b75d40e 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -238,7 +238,8 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s "use_pretrained_backbone", "backbone", "backbone_config", - "use_timm_backbone" + "use_timm_backbone", + "backbone_kwargs", ] attributes_used_in_generation = ["encoder_no_repeat_ngram_size"] From 19fd92d51e6f529b3f93faf016d43749224e675b Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 20:26:43 +0000 Subject: [PATCH 20/32] Draft --- .../configuration_conditional_detr.py | 37 ++++++++--- .../modeling_conditional_detr.py | 27 +------- .../configuration_deformable_detr.py | 15 ++++- 
.../modeling_deformable_detr.py | 26 +------- .../models/deta/configuration_deta.py | 5 +- .../models/detr/configuration_detr.py | 37 +++++++++-- src/transformers/models/detr/modeling_detr.py | 27 +------- .../models/dpt/configuration_dpt.py | 14 +++-- .../mask2former/configuration_mask2former.py | 5 +- .../maskformer/configuration_maskformer.py | 8 ++- .../oneformer/configuration_oneformer.py | 8 ++- .../configuration_table_transformer.py | 37 +++++++++-- .../modeling_table_transformer.py | 27 +------- .../timm_backbone/modeling_timm_backbone.py | 7 ++- .../models/tvp/configuration_tvp.py | 5 +- .../models/upernet/configuration_upernet.py | 5 +- .../vit_hybrid/configuration_vit_hybrid.py | 5 +- .../models/vitmatte/configuration_vitmatte.py | 5 +- src/transformers/utils/backbone_utils.py | 6 +- .../test_modeling_deformable_detr.py | 1 - tests/models/detr/test_modeling_detr.py | 3 + tests/utils/test_backbone_utils.py | 61 ++++++++++++++++++- utils/check_config_attributes.py | 4 ++ 23 files changed, 230 insertions(+), 145 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 1630f75bc2406e..ef57bd77ed1759 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -96,7 +96,8 @@ class ConditionalDetrConfig(PretrainedConfig): use_pretrained_backbone (`bool`, *optional*, defaults to `True`): Whether to use pretrained weights for the backbone. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -191,10 +192,14 @@ def __init__( if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: - raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - - if not use_timm_backbone: + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -205,7 +210,7 @@ def __init__( self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config - self.num_channels = num_channels + self._num_channels = num_channels self.num_queries = num_queries self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim @@ -227,8 +232,8 @@ def __init__( self.position_embedding_type = position_embedding_type self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} - self.dilation = dilation + self.backbone_kwargs = backbone_kwargs + self._dilation = dilation # Hungarian matcher self.class_cost = class_cost self.bbox_cost = bbox_cost @@ -242,6 +247,16 @@ def __init__( self.focal_alpha = focal_alpha super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + @property + def num_channels(self): + logger.warn("The `num_channels` attribute is deprecated and will be removed in v4.40") + return self._num_channels + + @property + def dilation(self): + logger.warn("The `dilation` attribute is deprecated and will be removed in v4.40") + return self._dilation + @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @@ -250,6 +265,12 @@ def num_attention_heads(self) -> int: def hidden_size(self) -> int: return self.d_model + def to_dict(self): + output = super().to_dict() + output.pop("_num_channels", None) + output.pop("_dilation", None) + return output + class ConditionalDetrOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index d8ff371fad77d1..b926b6df482740 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -32,7 +32,6 @@ add_start_docstrings_to_model_forward, is_accelerate_available, is_scipy_available, - is_timm_available, is_vision_available, logging, replace_return_docstrings, @@ -49,9 +48,6 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment -if is_timm_available(): - from timm import create_model - if is_vision_available(): from ...image_transforms import center_to_corners_format @@ -351,30 +347,13 @@ def __init__(self, config): super().__init__() self.config = config - - if config.use_timm_backbone: - requires_backends(self, ["timm"]) - kwargs = {} - if config.dilation: - kwargs["output_stride"] = 16 - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, - **kwargs, - ) - else: - backbone = load_backbone(config) + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = ( - self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels - ) + self.intermediate_channel_sizes = self.model.channels backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type if "resnet" in backbone_model_type: @@ -388,7 +367,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to 
get list of feature maps - features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + features = self.model(pixel_values).feature_maps out = [] for feature_map in features: diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 745cdcca6e5da8..41642ec8783c94 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -89,7 +89,8 @@ class DeformableDetrConfig(PretrainedConfig): use_pretrained_backbone (`bool`, *optional*, defaults to `True`): Whether to use pretrained weights for the backbone. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -211,7 +212,14 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [2, 3, 4] if num_feature_levels > 1 else [4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -219,6 +227,7 @@ def __init__( backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config self.num_channels = num_channels @@ -242,7 +251,7 @@ def __init__( self.position_embedding_type = position_embedding_type self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs self.dilation = dilation # deformable attributes self.num_feature_levels = num_feature_levels diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index c0ac7cffc7ab44..e5dfd5906a9972 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -35,7 +35,6 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_scipy_available, - is_timm_available, is_torch_cuda_available, is_vision_available, replace_return_docstrings, @@ -144,8 +143,6 @@ def backward(context, grad_output): if is_scipy_available(): from scipy.optimize import linear_sum_assignment -if is_timm_available(): - from timm import create_model logger = logging.get_logger(__name__) @@ -419,30 +416,13 @@ def __init__(self, config): super().__init__() self.config = config - - if config.use_timm_backbone: - requires_backends(self, ["timm"]) - kwargs = {} - if config.dilation: - kwargs["output_stride"] = 16 - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), - in_chans=config.num_channels, - **kwargs, - ) - else: - backbone = load_backbone(config) + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = ( - self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels - ) + self.intermediate_channel_sizes = self.model.channels backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type if "resnet" in backbone_model_type: @@ -457,7 +437,7 @@ def __init__(self, config): # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->DeformableDetr def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + features = self.model(pixel_values).feature_maps out = [] for feature_map in features: diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index a30e496ee50612..1604bc56e6396d 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -49,7 +49,8 @@ class DetaConfig(PretrainedConfig): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. 
backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead. @@ -217,7 +218,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index b72d372e117c48..9b3bd8b3bec4ef 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -97,7 +97,8 @@ class DetrConfig(PretrainedConfig): use_pretrained_backbone (`bool`, *optional*, `True`): Whether to use pretrained weights for the backbone. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -192,7 +193,14 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -200,12 +208,13 @@ def __init__( backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + backbone = None # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None + dilation = None self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config - self.num_channels = num_channels + self._num_channels = num_channels self.num_queries = num_queries self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim @@ -227,8 +236,8 @@ def __init__( self.position_embedding_type = position_embedding_type self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} - self.dilation = dilation + self.backbone_kwargs = backbone_kwargs + self._dilation = dilation # Hungarian matcher self.class_cost = class_cost self.bbox_cost = bbox_cost @@ -241,6 +250,16 @@ def __init__( self.eos_coefficient = eos_coefficient super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + @property + def num_channels(self): + logger.warn("The `num_channels` attribute is deprecated and will be removed in v4.40") + return self._num_channels + + @property + def dilation(self): + logger.warn("The `dilation` attribute is deprecated and will be removed in v4.40") + return self._dilation + @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @@ -261,6 +280,12 @@ def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs): """ return cls(backbone_config=backbone_config, **kwargs) + def to_dict(self): + output = super().to_dict() + output.pop("_num_channels", None) + output.pop("_dilation", None) + return output + class DetrOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index d7fcdfc5bc7e83..b23bda7e8fa093 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -32,7 +32,6 @@ add_start_docstrings_to_model_forward, is_accelerate_available, is_scipy_available, - is_timm_available, is_vision_available, logging, replace_return_docstrings, @@ -49,9 +48,6 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment -if is_timm_available(): - from timm import create_model - if is_vision_available(): from transformers.image_transforms import center_to_corners_format @@ -344,30 +340,13 @@ def __init__(self, config): super().__init__() self.config = config - - if config.use_timm_backbone: - requires_backends(self, ["timm"]) - kwargs = {} - if config.dilation: - kwargs["output_stride"] = 16 - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, - **kwargs, - ) - else: - backbone = load_backbone(config) + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = ( - self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels - ) + self.intermediate_channel_sizes = self.model.channels backbone_model_type = config.backbone if 
config.use_timm_backbone else config.backbone_config.model_type if "resnet" in backbone_model_type: @@ -381,7 +360,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + features = self.model(pixel_values).feature_maps out = [] for feature_map in features: diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index fbab5a5b79745a..9bdc8d1ef0affb 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -119,7 +119,8 @@ class DPTConfig(PretrainedConfig): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. Example: @@ -184,9 +185,6 @@ def __init__( if use_pretrained_backbone: raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: - raise ValueError("You can't specify both `backbone` and `backbone_config`.") - use_autobackbone = False if self.is_hybrid: if backbone_config is None and backbone is None: @@ -231,10 +229,16 @@ def __init__( self.backbone_featmap_shape = None self.neck_ignore_stages = [] + if use_autobackbone and backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: + raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs self.num_hidden_layers = None if use_autobackbone else num_hidden_layers self.num_attention_heads = None if use_autobackbone else num_attention_heads self.intermediate_size = None if use_autobackbone else intermediate_size diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index 6de55ade8e55e2..f0d13b8e030ed1 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ b/src/transformers/models/mask2former/configuration_mask2former.py @@ -51,7 +51,8 @@ class Mask2FormerConfig(PretrainedConfig): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. feature_size (`int`, *optional*, defaults to 256): The features (channels) of the resulting feature maps. 
mask_feature_size (`int`, *optional*, defaults to 256): @@ -233,7 +234,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs super().__init__(**kwargs) diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index 535b0fa9eef013..653350ca056dda 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -61,7 +61,8 @@ class MaskFormerConfig(PretrainedConfig): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. decoder_config (`Dict`, *optional*): The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50` will be used. @@ -131,6 +132,9 @@ def __init__( if backbone_config is not None and backbone is not None: raise ValueError("You can't specify both `backbone` and `backbone_config`.") + if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: + raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + if backbone_config is None and backbone is None: # fall back to https://huggingface.co/microsoft/swin-base-patch4-window12-384-in22k backbone_config = SwinConfig( @@ -195,7 +199,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs super().__init__(**kwargs) @classmethod diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index f04a90bf81bebf..1cbd2ab7dbc18f 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -50,7 +50,8 @@ class OneFormerConfig(PretrainedConfig): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. ignore_value (`int`, *optional*, defaults to 255): Values to be ignored in GT label while calculating loss. 
num_queries (`int`, *optional*, defaults to 150): @@ -222,11 +223,14 @@ def __init__( config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: + raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs self.ignore_value = ignore_value self.num_queries = num_queries self.no_object_weight = no_object_weight diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 3f2f0cadb81581..c6777290ac02d0 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -96,7 +96,8 @@ class TableTransformerConfig(PretrainedConfig): use_pretrained_backbone (`bool`, *optional*, `True`): Whether to use pretrained weights for the backbone. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -192,7 +193,14 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -200,12 +208,13 @@ def __init__( backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + backbone = None # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None + dilation = None self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config - self.num_channels = num_channels + self._num_channels = num_channels self.num_queries = num_queries self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim @@ -227,8 +236,8 @@ def __init__( self.position_embedding_type = position_embedding_type self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} - self.dilation = dilation + self.backbone_kwargs = backbone_kwargs + self._dilation = dilation # Hungarian matcher self.class_cost = class_cost self.bbox_cost = bbox_cost @@ -241,6 +250,16 @@ def __init__( self.eos_coefficient = eos_coefficient super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + @property + def num_channels(self): + logger.warn("The `num_channels` attribute is deprecated and will be removed in v4.40") + return self._num_channels + + @property + def dilation(self): + logger.warn("The `dilation` attribute is deprecated and will be removed in v4.40") + return self._dilation + @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @@ -249,6 +268,12 @@ def num_attention_heads(self) -> int: def hidden_size(self) -> int: return self.d_model + def to_dict(self): + output = super().to_dict() + output.pop("_num_channels", None) + output.pop("_dilation", None) + return output + # Copied from transformers.models.detr.configuration_detr.DetrOnnxConfig class TableTransformerOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 8e577a65a5fe00..bf622be57a6b92 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -32,7 +32,6 @@ add_start_docstrings_to_model_forward, is_accelerate_available, is_scipy_available, - is_timm_available, is_vision_available, logging, replace_return_docstrings, @@ -45,9 +44,6 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment -if is_timm_available(): - from timm import create_model - if is_vision_available(): from transformers.image_transforms import center_to_corners_format @@ -278,30 +274,13 @@ def __init__(self, config): super().__init__() self.config = config - - if config.use_timm_backbone: - requires_backends(self, ["timm"]) - kwargs = {} - if config.dilation: - kwargs["output_stride"] = 16 - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, - **kwargs, - ) - else: - backbone = load_backbone(config) + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = ( - self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels - ) + 
self.intermediate_channel_sizes = self.model.channels backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type if "resnet" in backbone_model_type: @@ -315,7 +294,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + features = self.model(pixel_values).feature_maps out = [] for feature_map in features: diff --git a/src/transformers/models/timm_backbone/modeling_timm_backbone.py b/src/transformers/models/timm_backbone/modeling_timm_backbone.py index 0c6fe67b75731f..e8e0b28e042d6f 100644 --- a/src/transformers/models/timm_backbone/modeling_timm_backbone.py +++ b/src/transformers/models/timm_backbone/modeling_timm_backbone.py @@ -63,12 +63,13 @@ def __init__(self, config, **kwargs): # We just take the final layer by default. This matches the default for the transformers models. out_indices = config.out_indices if getattr(config, "out_indices", None) is not None else (-1,) + in_chans = kwargs.pop("in_chans", config.num_channels) self._backbone = timm.create_model( config.backbone, pretrained=pretrained, # This is currently not possible for transformer architectures. features_only=config.features_only, - in_chans=config.num_channels, + in_chans=in_chans, out_indices=out_indices, **kwargs, ) @@ -79,7 +80,9 @@ def __init__(self, config, **kwargs): # These are used to control the output of the model when called. If output_hidden_states is True, then # return_layers is modified to include all layers. - self._return_layers = self._backbone.return_layers + self._return_layers = { + layer["module"]: str(layer["index"]) for layer in self._backbone.feature_info.get_dicts() + } self._all_layers = {layer["module"]: str(i) for i, layer in enumerate(self._backbone.feature_info.info)} super()._init_backbone(config) diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 872b59f539168b..85b7ac6a41cbcc 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -51,7 +51,8 @@ class TvpConfig(PretrainedConfig): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. distance_loss_weight (`float`, *optional*, defaults to 1.0): The weight of distance loss. 
duration_loss_weight (`float`, *optional*, defaults to 0.1): @@ -152,7 +153,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs self.distance_loss_weight = distance_loss_weight self.duration_loss_weight = duration_loss_weight self.visual_prompter_type = visual_prompter_type diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index 16131c68a8e1d8..609818c80d17b7 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -46,7 +46,8 @@ class UperNetConfig(PretrainedConfig): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. hidden_size (`int`, *optional*, defaults to 512): The number of hidden units in the convolutional layers. initializer_range (`float`, *optional*, defaults to 0.02): @@ -124,7 +125,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs self.hidden_size = hidden_size self.initializer_range = initializer_range self.pool_scales = pool_scales diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index fb8f52338c6a72..8a8a808ec60d05 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -50,7 +50,8 @@ class ViTHybridConfig(PretrainedConfig): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
num_hidden_layers (`int`, *optional*, defaults to 12): @@ -156,7 +157,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 79aecab59b49d8..275640d1d079a1 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -51,7 +51,8 @@ class VitMatteConfig(PretrainedConfig): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. backbone_kwargs (`dict`, *optional*): - Keyword arguments to be passed to the backbone constructor e.g. `{'out_indices': (0, 1, 2, 3)}`. + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. hidden_size (`int`, *optional*, defaults to 384): The number of input channels of the decoder. batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -117,7 +118,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone - self.backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {} + self.backbone_kwargs = backbone_kwargs self.batch_norm_eps = batch_norm_eps self.hidden_size = hidden_size self.initializer_range = initializer_range diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index d5823b52b479e7..14fcfe4a50a2d2 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -288,7 +288,7 @@ def to_dict(self): return output -def load_backbone(config, **kwargs): +def load_backbone(config): """ Loads the backbone model from a config object. @@ -307,7 +307,9 @@ def load_backbone(config, **kwargs): backbone_kwargs = getattr(config, "backbone_kwargs", None) backbone_kwargs = {} if backbone_kwargs is None else backbone_kwargs - backbone_kwargs.update(kwargs) + + if backbone_kwargs and backbone_config is not None: + raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") # If there is a backbone_config and a backbone checkpoint, and use_pretrained_backbone=False then the desired # behaviour is ill-defined: do you want to load from the checkpoint's config or the backbone_config? 
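For reference, a minimal usage sketch of what `load_backbone` resolves to after this patch (it mirrors the new `test_load_backbone_*` tests added below; assumes `torch` and `timm` are installed and that the `microsoft/resnet-18` and timm `resnet18` checkpoints can be downloaded from the Hub):

from transformers import MaskFormerConfig
from transformers.utils.backbone_utils import load_backbone

# backbone_kwargs is forwarded to AutoBackbone when loading from a checkpoint name.
config = MaskFormerConfig(backbone="microsoft/resnet-18", backbone_kwargs={"out_indices": (0, 2)})
backbone = load_backbone(config)  # ResNetBackbone with out_indices (0, 2)

# With use_timm_backbone=True, the same kwargs are forwarded to the timm backbone instead.
timm_config = MaskFormerConfig(backbone="resnet18", use_timm_backbone=True, backbone_kwargs={"out_indices": (0, 1)})
timm_backbone = load_backbone(timm_config)  # TimmBackbone with out_indices (0, 1)

# Passing backbone_kwargs together with an explicit backbone_config raises a ValueError.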
diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 8487b5dc694d54..7a83c4f1ed80a8 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -152,7 +152,6 @@ def get_config(self): use_timm_backbone=False, backbone=None, backbone_config=resnet_config, - backbone=None, use_pretrained_backbone=False, ) diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 59b071e031aa8a..cce951561f3352 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -444,6 +444,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index 0c3ff4866e8379..244f62950f048f 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -16,7 +16,7 @@ import pytest -from transformers import DetrConfig, MaskFormerConfig +from transformers import DetrConfig, MaskFormerConfig, ResNetBackbone, ResNetConfig, TimmBackbone from transformers.testing_utils import require_torch, slow from transformers.utils.backbone_utils import ( BackboneMixin, @@ -137,6 +137,65 @@ def test_backbone_mixin(self): self.assertEqual(backbone.out_features, ["a", "c"]) self.assertEqual(backbone.out_indices, [-3, -1]) + @slow + @require_torch + def test_load_backbone_from_config(self): + """ + Test that load_backbone correctly loads a backbone from a backbone config. + """ + config = MaskFormerConfig(backbone_config=ResNetConfig(out_indices=(0, 2))) + backbone = load_backbone(config) + self.assertEqual(backbone.out_features, ["stem", "stage2"]) + self.assertEqual(backbone.out_indices, (0, 2)) + self.assertIsInstance(backbone, ResNetBackbone) + + @slow + @require_torch + def test_load_backbone_from_checkpoint(self): + """ + Test that load_backbone correctly loads a backbone from a checkpoint. + """ + config = MaskFormerConfig(backbone="microsoft/resnet-18", backbone_config=None) + backbone = load_backbone(config) + self.assertEqual(backbone.out_indices, [4]) + self.assertEqual(backbone.out_features, ["stage4"]) + self.assertIsInstance(backbone, ResNetBackbone) + + config = MaskFormerConfig( + backbone="resnet18", + use_timm_backbone=True, + ) + backbone = load_backbone(config) + # We can't know ahead of time the exact output features and indices, or the layer names before + # creating the timm model, so it defalts to the last layer (-1,) and has a different layer name + self.assertEqual(backbone.out_indices, (-1,)) + self.assertEqual(backbone.out_features, ["layer4"]) + self.assertIsInstance(backbone, TimmBackbone) + + @slow + @require_torch + def test_load_backbone_backbone_kwargs(self): + """ + Test that load_backbone correctly configures the loaded backbone with the provided kwargs. 
+ """ + config = MaskFormerConfig(backbone="resnet18", use_timm_backbone=True, backbone_kwargs={"out_indices": (0, 1)}) + backbone = load_backbone(config) + self.assertEqual(backbone.out_indices, (0, 1)) + self.assertIsInstance(backbone, TimmBackbone) + + config = MaskFormerConfig(backbone="microsoft/resnet-18", backbone_kwargs={"out_indices": (0, 2)}) + backbone = load_backbone(config) + self.assertEqual(backbone.out_indices, (0, 2)) + self.assertIsInstance(backbone, ResNetBackbone) + + # Check can't be passed with a backone config + with pytest.raises(ValueError): + config = MaskFormerConfig( + backbone="microsoft/resnet-18", + backbone_config=ResNetConfig(out_indices=(0, 2)), + backbone_kwargs={"out_indices": (0, 1)}, + ) + @slow @require_torch def test_load_backbone_in_new_model(self): diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index f631c59b75d40e..84abb1386bd2cf 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -132,9 +132,13 @@ # TODO (ydshieh): Check the failing cases, try to fix them or move some cases to the above block once we are sure SPECIAL_CASES_TO_ALLOW.update( { + # Has no longer used attributes that will be removed after a deprecation cycle in v4.40 + "ConditionalDetrConfig": True, "CLIPSegConfig": True, "DeformableDetrConfig": True, "DetaConfig": True, + # Has no longer used attributes that will be removed after a deprecation cycle in v4.40 + "DetrConfig": True, "DinatConfig": True, "DonutSwinConfig": True, "EfficientFormerConfig": True, From f9cbc017f9edab177acd1a3bdbf974395d3cd32c Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 19 Feb 2024 15:54:44 +0000 Subject: [PATCH 21/32] Fix tests --- src/transformers/models/dpt/modeling_dpt.py | 2 +- .../models/conditional_detr/test_modeling_conditional_detr.py | 2 ++ tests/models/detr/test_modeling_detr.py | 1 - .../table_transformer/test_modeling_table_transformer.py | 4 +++- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index a3899f86ff8457..ef6c8bb853abda 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1075,7 +1075,7 @@ def __init__(self, config): super().__init__(config) self.backbone = None - if config.is_hybrid: + if config.is_hybrid or config.backbone_config is None: self.dpt = DPTModel(config, add_pooling_layer=False) else: self.backbone = load_backbone(config) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index d1152ed8622b9c..bc53f826ed2125 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -444,7 +444,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index cce951561f3352..e581ce1729668b 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -474,7 +474,6 @@ def test_greyscale_images(self): ) # let's set num_channels to 1 - 
config.num_channels = 1 config.backbone_config.num_channels = 1 for model_class in self.all_model_classes: diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 79da1d191063ab..8f3fd1d7e8f8b7 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -456,6 +456,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -483,7 +486,6 @@ def test_greyscale_images(self): ) # let's set num_channels to 1 - config.num_channels = 1 config.backbone_config.num_channels = 1 for model_class in self.all_model_classes: From cffd51fc79ca6d62dcafd9202f45eeb2882bebd8 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 7 Mar 2024 20:50:40 +0000 Subject: [PATCH 22/32] Add back timm - weight naming --- .../configuration_conditional_detr.py | 14 ++------ .../modeling_conditional_detr.py | 32 ++++++++++++++++--- .../modeling_deformable_detr.py | 6 +++- .../models/detr/configuration_detr.py | 14 ++------ src/transformers/models/detr/modeling_detr.py | 32 +++++++++++++++++-- .../configuration_table_transformer.py | 14 ++------ .../modeling_table_transformer.py | 30 +++++++++++++++-- tests/utils/test_backbone_utils.py | 2 +- 8 files changed, 96 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index ef57bd77ed1759..bad7092419506c 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -210,7 +210,7 @@ def __init__( self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config - self._num_channels = num_channels + self.num_channels = num_channels self.num_queries = num_queries self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim @@ -233,7 +233,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.backbone_kwargs = backbone_kwargs - self._dilation = dilation + self.dilation = dilation # Hungarian matcher self.class_cost = class_cost self.bbox_cost = bbox_cost @@ -247,16 +247,6 @@ def __init__( self.focal_alpha = focal_alpha super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - @property - def num_channels(self): - logger.warn("The `num_channels` attribute is deprecated and will be removed in v4.40") - return self._num_channels - - @property - def dilation(self): - logger.warn("The `dilation` attribute is deprecated and will be removed in v4.40") - return self._dilation - @property def num_attention_heads(self) -> int: return self.encoder_attention_heads diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index b926b6df482740..b634462437da72 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -32,6 +32,7 @@ add_start_docstrings_to_model_forward, 
is_accelerate_available, is_scipy_available, + is_timm_available, is_vision_available, logging, replace_return_docstrings, @@ -48,6 +49,9 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment +if is_timm_available(): + from timm.models import create_model + if is_vision_available(): from ...image_transforms import center_to_corners_format @@ -335,7 +339,7 @@ def replace_batch_norm(model): # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder -class ConditionalDetrConvEncoder(nn.Module): +class DetrConvEncoder(nn.Module): """ Convolutional backbone, using either the AutoBackbone API or one from the timm library. @@ -347,13 +351,33 @@ def __init__(self, config): super().__init__() self.config = config - backbone = load_backbone(config) + + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = getattr(config, "backbone_kwargs", {}) + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) + if config.dilation: + kwargs["output_stride"] = kwargs.get("output_stride", 16) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=out_indices, + in_chans=num_channels, + **kwargs, + ) + else: + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = self.model.channels + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type if "resnet" in backbone_model_type: @@ -367,7 +391,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values).feature_maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps out = [] for feature_map in features: diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index e5dfd5906a9972..e146c1c8312da2 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -35,6 +35,7 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_scipy_available, + is_timm_available, is_torch_cuda_available, is_vision_available, replace_return_docstrings, @@ -91,6 +92,9 @@ def load_cuda_kernels(): from accelerate import PartialState from accelerate.utils import reduce +if is_timm_available(): + pass + class MultiScaleDeformableAttentionFunction(Function): @staticmethod @@ -437,7 +441,7 @@ def __init__(self, config): # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->DeformableDetr def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values).feature_maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps out = [] for feature_map in features: diff --git 
a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 9b3bd8b3bec4ef..2f3ce689e4ae0a 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -214,7 +214,7 @@ def __init__( self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config - self._num_channels = num_channels + self.num_channels = num_channels self.num_queries = num_queries self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim @@ -237,7 +237,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.backbone_kwargs = backbone_kwargs - self._dilation = dilation + self.dilation = dilation # Hungarian matcher self.class_cost = class_cost self.bbox_cost = bbox_cost @@ -250,16 +250,6 @@ def __init__( self.eos_coefficient = eos_coefficient super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - @property - def num_channels(self): - logger.warn("The `num_channels` attribute is deprecated and will be removed in v4.40") - return self._num_channels - - @property - def dilation(self): - logger.warn("The `dilation` attribute is deprecated and will be removed in v4.40") - return self._dilation - @property def num_attention_heads(self) -> int: return self.encoder_attention_heads diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index b23bda7e8fa093..8b5c47b45bda97 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -32,6 +32,7 @@ add_start_docstrings_to_model_forward, is_accelerate_available, is_scipy_available, + is_timm_available, is_vision_available, logging, replace_return_docstrings, @@ -48,6 +49,11 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment + +if is_timm_available(): + from timm import create_model + + if is_vision_available(): from transformers.image_transforms import center_to_corners_format @@ -340,13 +346,33 @@ def __init__(self, config): super().__init__() self.config = config - backbone = load_backbone(config) + + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = getattr(config, "backbone_kwargs", {}) + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) + if config.dilation: + kwargs["output_stride"] = kwargs.get("output_stride", 16) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=out_indices, + in_chans=num_channels, + **kwargs, + ) + else: + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = self.model.channels + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type if "resnet" in backbone_model_type: @@ -360,7 +386,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values).feature_maps + features = self.model(pixel_values) if 
self.config.use_timm_backbone else self.model(pixel_values).feature_maps out = [] for feature_map in features: diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index c6777290ac02d0..c18bbea69ee4a1 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -214,7 +214,7 @@ def __init__( self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config - self._num_channels = num_channels + self.num_channels = num_channels self.num_queries = num_queries self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim @@ -237,7 +237,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.backbone_kwargs = backbone_kwargs - self._dilation = dilation + self.dilation = dilation # Hungarian matcher self.class_cost = class_cost self.bbox_cost = bbox_cost @@ -250,16 +250,6 @@ def __init__( self.eos_coefficient = eos_coefficient super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - @property - def num_channels(self): - logger.warn("The `num_channels` attribute is deprecated and will be removed in v4.40") - return self._num_channels - - @property - def dilation(self): - logger.warn("The `dilation` attribute is deprecated and will be removed in v4.40") - return self._dilation - @property def num_attention_heads(self) -> int: return self.encoder_attention_heads diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index bf622be57a6b92..32a656302eac2b 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -32,6 +32,7 @@ add_start_docstrings_to_model_forward, is_accelerate_available, is_scipy_available, + is_timm_available, is_vision_available, logging, replace_return_docstrings, @@ -44,6 +45,9 @@ if is_scipy_available(): from scipy.optimize import linear_sum_assignment +if is_timm_available(): + from timm.models import create_model + if is_vision_available(): from transformers.image_transforms import center_to_corners_format @@ -274,13 +278,33 @@ def __init__(self, config): super().__init__() self.config = config - backbone = load_backbone(config) + + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = getattr(config, "backbone_kwargs", {}) + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) + if config.dilation: + kwargs["output_stride"] = kwargs.get("output_stride", 16) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=out_indices, + in_chans=num_channels, + **kwargs, + ) + else: + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = self.model.channels + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type if 
"resnet" in backbone_model_type: @@ -294,7 +318,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): # send pixel_values through the model to get list of feature maps - features = self.model(pixel_values).feature_maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps out = [] for feature_map in features: diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index 244f62950f048f..cd9a5a29a8c071 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -167,7 +167,7 @@ def test_load_backbone_from_checkpoint(self): ) backbone = load_backbone(config) # We can't know ahead of time the exact output features and indices, or the layer names before - # creating the timm model, so it defalts to the last layer (-1,) and has a different layer name + # creating the timm model, so it defaults to the last layer (-1,) and has a different layer name self.assertEqual(backbone.out_indices, (-1,)) self.assertEqual(backbone.out_features, ["layer4"]) self.assertIsInstance(backbone, TimmBackbone) From 5bf5329d1605b78e427643336226164a845cc47c Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 8 Mar 2024 13:14:49 +0000 Subject: [PATCH 23/32] More tidying up --- .../configuration_conditional_detr.py | 6 ----- .../modeling_conditional_detr.py | 6 ++--- .../modeling_deformable_detr.py | 25 ++++++++++++++++--- .../models/detr/configuration_detr.py | 6 ----- .../configuration_table_transformer.py | 6 ----- .../modeling_table_transformer.py | 2 +- utils/check_config_attributes.py | 4 --- 7 files changed, 26 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index bad7092419506c..88c3a4d51a19d0 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -255,12 +255,6 @@ def num_attention_heads(self) -> int: def hidden_size(self) -> int: return self.d_model - def to_dict(self): - output = super().to_dict() - output.pop("_num_channels", None) - output.pop("_dilation", None) - return output - class ConditionalDetrOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index b634462437da72..6adc90a328cd35 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -50,7 +50,7 @@ from scipy.optimize import linear_sum_assignment if is_timm_available(): - from timm.models import create_model + from timm import create_model if is_vision_available(): from ...image_transforms import center_to_corners_format @@ -338,8 +338,8 @@ def replace_batch_norm(model): replace_batch_norm(module) -# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder -class DetrConvEncoder(nn.Module): +# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr >ConditionalDetr +class ConditionalDetrConvEncoder(nn.Module): """ Convolutional backbone, using either the AutoBackbone API or one from the timm library. 
diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index e146c1c8312da2..52eeb14ebbbb87 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -93,7 +93,7 @@ def load_cuda_kernels(): from accelerate.utils import reduce if is_timm_available(): - pass + from timm import create_model class MultiScaleDeformableAttentionFunction(Function): @@ -420,13 +420,32 @@ def __init__(self, config): super().__init__() self.config = config - backbone = load_backbone(config) + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = getattr(config, "backbone_kwargs", {}) + out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) + num_channels = kwargs.pop("in_chans", config.num_channels) + if config.dilation: + kwargs["output_stride"] = kwargs.get("output_stride", 16) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=out_indices, + in_chans=in_chans, + **kwargs, + ) + else: + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone - self.intermediate_channel_sizes = self.model.channels + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type if "resnet" in backbone_model_type: diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 2f3ce689e4ae0a..c5dafa8f9b7649 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -270,12 +270,6 @@ def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs): """ return cls(backbone_config=backbone_config, **kwargs) - def to_dict(self): - output = super().to_dict() - output.pop("_num_channels", None) - output.pop("_dilation", None) - return output - class DetrOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index c18bbea69ee4a1..c7f14b9d202b3b 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -258,12 +258,6 @@ def num_attention_heads(self) -> int: def hidden_size(self) -> int: return self.d_model - def to_dict(self): - output = super().to_dict() - output.pop("_num_channels", None) - output.pop("_dilation", None) - return output - # Copied from transformers.models.detr.configuration_detr.DetrOnnxConfig class TableTransformerOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 32a656302eac2b..f06de5d8d2fbd8 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -46,7 +46,7 @@ from scipy.optimize import linear_sum_assignment if is_timm_available(): - from timm.models 
import create_model + from timm import create_model if is_vision_available(): from transformers.image_transforms import center_to_corners_format diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 84abb1386bd2cf..f631c59b75d40e 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -132,13 +132,9 @@ # TODO (ydshieh): Check the failing cases, try to fix them or move some cases to the above block once we are sure SPECIAL_CASES_TO_ALLOW.update( { - # Has no longer used attributes that will be removed after a deprecation cycle in v4.40 - "ConditionalDetrConfig": True, "CLIPSegConfig": True, "DeformableDetrConfig": True, "DetaConfig": True, - # Has no longer used attributes that will be removed after a deprecation cycle in v4.40 - "DetrConfig": True, "DinatConfig": True, "DonutSwinConfig": True, "EfficientFormerConfig": True, From 7d4e93abe654bfb8c3a0491d4952c7b98ac3cccf Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 8 Mar 2024 13:16:29 +0000 Subject: [PATCH 24/32] Whoops --- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 6adc90a328cd35..67c3c530723f8d 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -338,7 +338,7 @@ def replace_batch_norm(model): replace_batch_norm(module) -# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr >ConditionalDetr +# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr->ConditionalDetr class ConditionalDetrConvEncoder(nn.Module): """ Convolutional backbone, using either the AutoBackbone API or one from the timm library. From ac56450ff8d4041e48b30618a4f371611350f20c Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 8 Mar 2024 13:20:05 +0000 Subject: [PATCH 25/32] Tidy up --- .../modeling_conditional_detr.py | 2 +- .../modeling_deformable_detr.py | 32 +++++++++++-------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 67c3c530723f8d..bcc86b5bdcd44f 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -343,7 +343,7 @@ class ConditionalDetrConvEncoder(nn.Module): """ Convolutional backbone, using either the AutoBackbone API or one from the timm library. - nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. 
""" diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 52eeb14ebbbb87..5d4bf9023c07e0 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -88,14 +88,31 @@ def load_cuda_kernels(): if is_vision_available(): from transformers.image_transforms import center_to_corners_format + if is_accelerate_available(): from accelerate import PartialState from accelerate.utils import reduce + if is_timm_available(): from timm import create_model +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DeformableDetrConfig" +_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" + +DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "sensetime/deformable-detr", + # See all Deformable DETR models at https://huggingface.co/models?filter=deformable-detr +] + + class MultiScaleDeformableAttentionFunction(Function): @staticmethod def forward( @@ -144,19 +161,6 @@ def backward(context, grad_output): return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "DeformableDetrConfig" -_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" - - -from ..deprecated._archive_maps import DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST # noqa: F401, E402 - - @dataclass class DeformableDetrDecoderOutput(ModelOutput): """ @@ -433,7 +437,7 @@ def __init__(self, config): pretrained=config.use_pretrained_backbone, features_only=True, out_indices=out_indices, - in_chans=in_chans, + in_chans=num_channels, **kwargs, ) else: From 1c55822e8630bc7533563f6908430d9681b44185 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:09:02 +0000 Subject: [PATCH 26/32] Handle when kwargs are none --- .../models/conditional_detr/modeling_conditional_detr.py | 1 + .../models/deformable_detr/modeling_deformable_detr.py | 1 + src/transformers/models/detr/modeling_detr.py | 1 + .../models/table_transformer/modeling_table_transformer.py | 1 + 4 files changed, 4 insertions(+) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index bcc86b5bdcd44f..f2d762addbb2ca 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -356,6 +356,7 @@ def __init__(self, config): if config.use_timm_backbone: requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 5d4bf9023c07e0..61a5c7a1f112ad 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -428,6 +428,7 @@ def __init__(self, config): if config.use_timm_backbone: requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if 
kwargs is None else kwargs out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 8b5c47b45bda97..67cde1da2ddce0 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -351,6 +351,7 @@ def __init__(self, config): if config.use_timm_backbone: requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index f06de5d8d2fbd8..a600c50c6f1b8d 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -283,6 +283,7 @@ def __init__(self, config): if config.use_timm_backbone: requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: From 30d3232dcc3c6d7c7764ff65b40a228c471e2042 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:38:28 +0000 Subject: [PATCH 27/32] Update tests --- .../models/conditional_detr/test_modeling_conditional_detr.py | 3 +++ tests/models/deformable_detr/test_modeling_deformable_detr.py | 3 +++ tests/models/detr/test_modeling_detr.py | 3 +++ .../table_transformer/test_modeling_table_transformer.py | 3 +++ 4 files changed, 12 insertions(+) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index bc53f826ed2125..f541d0e6dc27e7 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -465,6 +465,9 @@ def test_different_timm_backbone(self): self.assertTrue(outputs) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) + def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 7a83c4f1ed80a8..dc3fd57a611204 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -541,6 +541,9 @@ def test_different_timm_backbone(self): self.assertTrue(outputs) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) + def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index e581ce1729668b..63ce05ef41164f 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -465,6 +465,9 @@ def 
test_different_timm_backbone(self): self.assertTrue(outputs) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) + def test_greyscale_images(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 8f3fd1d7e8f8b7..a1b0303793eff7 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -477,6 +477,9 @@ def test_different_timm_backbone(self): self.assertTrue(outputs) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) + def test_greyscale_images(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From 818343685b07b7972f185c18165d86540ae5add2 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:53:24 +0000 Subject: [PATCH 28/32] Revert test changes --- tests/models/detr/test_modeling_detr.py | 1 + .../models/table_transformer/test_modeling_table_transformer.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 63ce05ef41164f..d2a45fdf2c6433 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -477,6 +477,7 @@ def test_greyscale_images(self): ) # let's set num_channels to 1 + config.num_channels = 1 config.backbone_config.num_channels = 1 for model_class in self.all_model_classes: diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index a1b0303793eff7..01e00531d55655 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -489,6 +489,7 @@ def test_greyscale_images(self): ) # let's set num_channels to 1 + config.num_channels = 1 config.backbone_config.num_channels = 1 for model_class in self.all_model_classes: From 61fe67360250ac1526dc432bee3666953789de0f Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:58:15 +0000 Subject: [PATCH 29/32] Deformable detr test - don't use default --- .../models/deformable_detr/test_modeling_deformable_detr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index dc3fd57a611204..1413fd0f072809 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -521,8 +521,9 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" - config.use_timm_backbone = True config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [1, 2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -542,7 +543,7 @@ def test_different_timm_backbone(self): self.assertTrue(outputs) # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) + 
self.assertEqual(len(model.backbone.intermediate_channel_sizes), 4) def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From 94309d9284c10f56921d534b5a75aaa0801ad98b Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 12 Mar 2024 14:06:07 +0000 Subject: [PATCH 30/32] Don't mutate; correct model attributes --- .../conditional_detr/modeling_conditional_detr.py | 2 +- .../deformable_detr/modeling_deformable_detr.py | 2 +- src/transformers/models/detr/modeling_detr.py | 2 +- .../table_transformer/modeling_table_transformer.py | 2 +- .../test_modeling_conditional_detr.py | 11 ++++++++--- .../deformable_detr/test_modeling_deformable_detr.py | 11 ++++++++--- tests/models/detr/test_modeling_detr.py | 11 ++++++++--- .../test_modeling_table_transformer.py | 8 +++++--- 8 files changed, 33 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index f2d762addbb2ca..c464a41861b37a 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -356,7 +356,7 @@ def __init__(self, config): if config.use_timm_backbone: requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) - kwargs = {} if kwargs is None else kwargs + kwargs = {} if kwargs is None else kwargs.copy() out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 61a5c7a1f112ad..62b9e8768da765 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -428,7 +428,7 @@ def __init__(self, config): if config.use_timm_backbone: requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) - kwargs = {} if kwargs is None else kwargs + kwargs = {} if kwargs is None else kwargs.copy() out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 67cde1da2ddce0..98aa634ea2fb54 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -351,7 +351,7 @@ def __init__(self, config): if config.use_timm_backbone: requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) - kwargs = {} if kwargs is None else kwargs + kwargs = {} if kwargs is None else kwargs.copy() out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index a600c50c6f1b8d..ee98521d759255 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -283,7 +283,7 @@ def __init__(self, config): if config.use_timm_backbone: requires_backends(self, ["timm"]) kwargs 
= getattr(config, "backbone_kwargs", {}) - kwargs = {} if kwargs is None else kwargs + kwargs = {} if kwargs is None else kwargs.copy() out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index f541d0e6dc27e7..c3f77614b4dd31 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -462,12 +462,17 @@ def test_different_timm_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) - def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 1413fd0f072809..36be099790a45b 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -539,12 +539,17 @@ def test_different_timm_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.deformable_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4) self.assertTrue(outputs) - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.backbone.intermediate_channel_sizes), 4) - def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index d2a45fdf2c6433..27092c626dd46d 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -462,12 +462,17 @@ def test_different_timm_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "DetrForSegmentation": + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propogated to backbone + 
self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) - def test_greyscale_images(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 01e00531d55655..d323083eb7f1d4 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -474,12 +474,14 @@ def test_different_timm_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propogated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) - def test_greyscale_images(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From 80b32cc8a6d2a5c9cd223e518beb00db18d64201 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:59:53 +0000 Subject: [PATCH 31/32] Add some clarifying comments --- .../models/conditional_detr/configuration_conditional_detr.py | 2 ++ .../models/conditional_detr/modeling_conditional_detr.py | 2 ++ .../models/deformable_detr/configuration_deformable_detr.py | 2 ++ .../models/deformable_detr/modeling_deformable_detr.py | 3 +++ src/transformers/models/detr/configuration_detr.py | 2 ++ src/transformers/models/detr/modeling_detr.py | 2 ++ .../table_transformer/configuration_table_transformer.py | 2 ++ .../models/table_transformer/modeling_table_transformer.py | 2 ++ 8 files changed, 17 insertions(+) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 88c3a4d51a19d0..f89e9bc04f2807 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -192,6 +192,8 @@ def __init__( if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") + # We default to values which were previously hard-coded in the model. This enables configurability of config + # while keeping the default behavior the same. if use_timm_backbone and backbone_kwargs is None: backbone_kwargs = {} if dilation: diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index c464a41861b37a..d723d3866ea416 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -354,6 +354,8 @@ def __init__(self, config): # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. 
This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) kwargs = {} if kwargs is None else kwargs.copy() diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 41642ec8783c94..5fd6bdb82f5839 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -212,6 +212,8 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + # We default to values which were previously hard-coded in the model. This enables configurability of config + # while keeping the default behavior the same. if use_timm_backbone and backbone_kwargs is None: backbone_kwargs = {} if dilation: diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 62b9e8768da765..7b2bbb9b1242c9 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -425,7 +425,10 @@ def __init__(self, config): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) kwargs = {} if kwargs is None else kwargs.copy() diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index c5dafa8f9b7649..1606618fc4d8c3 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -193,6 +193,8 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + # We default to values which were previously hard-coded in the model. This enables configurability of config + # while keeping the default behavior the same. if use_timm_backbone and backbone_kwargs is None: backbone_kwargs = {} if dilation: diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 98aa634ea2fb54..0da702db8b67e2 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -349,6 +349,8 @@ def __init__(self, config): # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. 
requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) kwargs = {} if kwargs is None else kwargs.copy() diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index c7f14b9d202b3b..037f2c82929001 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -193,6 +193,8 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + # We default to values which were previously hard-coded in the model. This enables configurability of config + # while keeping the default behavior the same. if use_timm_backbone and backbone_kwargs is None: backbone_kwargs = {} if dilation: diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index ee98521d759255..9a684ee121ddca 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -281,6 +281,8 @@ def __init__(self, config): # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. requires_backends(self, ["timm"]) kwargs = getattr(config, "backbone_kwargs", {}) kwargs = {} if kwargs is None else kwargs.copy() From 5a9799c1649a98841033895399c7e3d25202f550 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 26 Apr 2024 11:04:06 +0000 Subject: [PATCH 32/32] nit - grammar is hard --- .../models/conditional_detr/configuration_conditional_detr.py | 2 +- .../models/deformable_detr/configuration_deformable_detr.py | 2 +- src/transformers/models/detr/configuration_detr.py | 2 +- .../models/table_transformer/configuration_table_transformer.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index f89e9bc04f2807..4f95de3582f082 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -192,7 +192,7 @@ def __init__( if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - # We default to values which were previously hard-coded in the model. This enables configurability of config + # We default to values which were previously hard-coded in the model. This enables configurability of the config # while keeping the default behavior the same. 
if use_timm_backbone and backbone_kwargs is None: backbone_kwargs = {} diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 5fd6bdb82f5839..3f3ffff69ff2e9 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -212,7 +212,7 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - # We default to values which were previously hard-coded in the model. This enables configurability of config + # We default to values which were previously hard-coded in the model. This enables configurability of the config # while keeping the default behavior the same. if use_timm_backbone and backbone_kwargs is None: backbone_kwargs = {} diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 1606618fc4d8c3..db180ef1d41fed 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -193,7 +193,7 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - # We default to values which were previously hard-coded in the model. This enables configurability of config + # We default to values which were previously hard-coded in the model. This enables configurability of the config # while keeping the default behavior the same. if use_timm_backbone and backbone_kwargs is None: backbone_kwargs = {} diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 037f2c82929001..4963396024a57e 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -193,7 +193,7 @@ def __init__( if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - # We default to values which were previously hard-coded in the model. This enables configurability of config + # We default to values which were previously hard-coded in the model. This enables configurability of the config # while keeping the default behavior the same. if use_timm_backbone and backbone_kwargs is None: backbone_kwargs = {}
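End-to-end, the series leaves the DETR-family configs accepting a timm backbone name plus backbone_kwargs, with out_indices forwarded to timm instead of being hard-coded. A minimal usage sketch of the resulting surface, assuming timm is installed; the checkpoint and indices are simply the values the updated tests set, pretrained weights are turned off so nothing is downloaded, and the snippet itself is not part of the patch.

    from transformers import DetrConfig, DetrForObjectDetection

    config = DetrConfig(
        use_timm_backbone=True,
        backbone="tf_mobilenetv3_small_075",         # timm model name, as in the updated tests
        use_pretrained_backbone=False,
        backbone_config=None,
        backbone_kwargs={"out_indices": [2, 3, 4]},  # forwarded to timm.create_model
    )
    model = DetrForObjectDetection(config)

    # The conv encoder now exposes one channel size per requested stage,
    # which is what the new test assertions check.
    print(len(model.model.backbone.conv_encoder.intermediate_channel_sizes))  # 3

The non-timm route instead goes through load_backbone with a backbone_config (or a Hub checkpoint plus backbone_kwargs), as exercised by the MaskFormerConfig/ResNetBackbone test at the top of this section.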