diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index d6d1041ea656ad..d783488caecc75 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -248,7 +248,7 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], install_steps=["uv venv && uv pip install ."], parallelism=6, - pytest_num_workers=16 + pytest_num_workers=4 ) tokenization_job = CircleCIJob( @@ -256,7 +256,7 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], install_steps=["uv venv && uv pip install ."], parallelism=6, - pytest_num_workers=16 + pytest_num_workers=4 ) @@ -265,7 +265,7 @@ def job_name(self): docker_image=[{"image":"huggingface/transformers-tf-light"}], install_steps=["uv venv", "uv pip install -e."], parallelism=6, - pytest_num_workers=16, + pytest_num_workers=4, ) @@ -274,7 +274,7 @@ def job_name(self): docker_image=[{"image":"huggingface/transformers-jax-light"}], install_steps=["uv venv && uv pip install ."], parallelism=6, - pytest_num_workers=16 + pytest_num_workers=4 ) diff --git a/awesome-transformers.md b/awesome-transformers.md index 2ecdd3406f7095..d55e276841a3b0 100644 --- a/awesome-transformers.md +++ b/awesome-transformers.md @@ -596,7 +596,7 @@ Keywords: Data-Centric AI, Data Quality, Noisy Labels, Outlier Detection, Active ## [BentoML](https://github.com/bentoml/BentoML) -[BentoML](https://github.com/bentoml) is the unified framework for for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models. +[BentoML](https://github.com/bentoml) is the unified framework for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models. All Hugging Face models and pipelines can be seamlessly integrated into BentoML applications, enabling the running of models on the most suitable hardware and independent scaling based on usage. Keywords: BentoML, Framework, Deployment, AI Applications diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 62ba9c714c1da5..e8c83145e42f61 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -597,6 +597,8 @@ title: DeiT - local: model_doc/depth_anything title: Depth Anything + - local: model_doc/depth_anything_v2 + title: Depth Anything V2 - local: model_doc/deta title: DETA - local: model_doc/detr @@ -681,6 +683,8 @@ title: ViTMSN - local: model_doc/yolos title: YOLOS + - local: model_doc/zoedepth + title: ZoeDepth - title: Audio models sections: - local: model_doc/audio-spectrogram-transformer diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md index 2cacaed5902c4d..d1c550f5d32ea8 100644 --- a/docs/source/en/agents.md +++ b/docs/source/en/agents.md @@ -50,7 +50,7 @@ We implement two versions of ReactJsonAgent: ![Framework of a React Agent](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png) -For example, here is how a ReAct agent would work its way through the following question. +For example, here is how a ReAct Code agent would work its way through the following question. 
```py3 >>> agent.run( @@ -188,7 +188,7 @@ You can still authorize additional imports by passing the authorized modules as >>> from transformers import ReactCodeAgent >>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4']) ->>>agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?") +>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?") (...) 'Hugging Face – Blog' @@ -256,6 +256,13 @@ agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_cus > Please make sure to define the `<>` string somewhere in the `template` so the agent is aware of the available tools. + +### Inspecting an agent run + +Here are a few useful attributes to inspect what happened after a run: +- `agent.logs` stores the fine-grained logs of the agent. At every step of the agent's run, everything gets stored in a dictionary that then is appended to `agent.logs`. +- Running `agent.write_inner_memory_from_logs()` creates an inner memory of the agent's logs for the LLM to view, as a list of chat messages. This method goes over each step of the log and only stores what it's interested in as a message: for instance, it will save the system prompt and task in separate messages, then for each step it will store the LLM output as a message, and the tool call output as another message. Use this if you want a higher-level view of what has happened - but not every log will be transcripted by this method. + ## Tools A tool is an atomic function to be used by an agent. @@ -379,7 +386,7 @@ And the output: `"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."` -### Manage agent toolbox +### Manage your agent's toolbox If you have already initialized an agent, it is inconvenient to reinitialize it from scratch with a tool you want to use. With Transformers, you can manage an agent's toolbox by adding or replacing a tool. diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index df1843743e29b8..d840caaf660520 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -199,7 +199,8 @@ effect that `add_generation_prompt` has will depend on the template being used. ## Can I use chat templates in training? -Yes! We recommend that you apply the chat template as a preprocessing step for your dataset. After this, you +Yes! This is a good way to ensure that the chat template matches the tokens the model sees during training. +We recommend that you apply the chat template as a preprocessing step for your dataset. After this, you can simply continue like any other language model training task. When training, you should usually set `add_generation_prompt=False`, because the added tokens to prompt an assistant response will not be helpful during training. Let's see an example: @@ -233,6 +234,16 @@ The sun. From here, just continue training like you would with a standard language modelling task, using the `formatted_chat` column. + +If you format text with `apply_chat_template(tokenize=False)` and then tokenize it in a separate step, you should set the argument +`add_special_tokens=False`. If you use `apply_chat_template(tokenize=True)`, you don't need to worry about this! + +By default, some tokenizers add special tokens like `` and `` to text they tokenize. 
Chat templates should +always include all of the special tokens they need, and so adding extra special tokens with +the default `add_special_tokens=True` can result in incorrect or duplicated special tokens, which will hurt model +performance. + + ## Advanced: Extra inputs to chat templates The only argument that `apply_chat_template` requires is `messages`. However, you can pass any keyword diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md index 868021a9cd2e27..7f7995c4664133 100644 --- a/docs/source/en/deepspeed.md +++ b/docs/source/en/deepspeed.md @@ -16,11 +16,11 @@ rendered properly in your Markdown viewer. # DeepSpeed -[DeepSpeed](https://www.deepspeed.ai/) is a PyTorch optimization library that makes distributed training memory-efficient and fast. At it's core is the [Zero Redundancy Optimizer (ZeRO)](https://hf.co/papers/1910.02054) which enables training large models at scale. ZeRO works in several stages: +[DeepSpeed](https://www.deepspeed.ai/) is a PyTorch optimization library that makes distributed training memory-efficient and fast. At its core is the [Zero Redundancy Optimizer (ZeRO)](https://hf.co/papers/1910.02054) which enables training large models at scale. ZeRO works in several stages: -* ZeRO-1, optimizer state partioning across GPUs +* ZeRO-1, optimizer state partitioning across GPUs * ZeRO-2, gradient partitioning across GPUs -* ZeRO-3, parameteter partitioning across GPUs +* ZeRO-3, parameter partitioning across GPUs In GPU-limited environments, ZeRO also enables offloading optimizer memory and computation from the GPU to the CPU to fit and train really large models on a single GPU. DeepSpeed is integrated with the Transformers [`Trainer`] class for all ZeRO stages and offloading. All you need to do is provide a config file or you can use a provided template. For inference, Transformers support ZeRO-3 and offloading since it allows loading huge models. @@ -159,7 +159,7 @@ There are three types of configuration parameters: You could also modify the DeepSpeed configuration and edit [`TrainingArguments`] from it: -1. Create or load a DeepSpeed configuration to used as the main configuration +1. Create or load a DeepSpeed configuration to use as the main configuration 2. Create a [`TrainingArguments`] object based on these DeepSpeed configuration values Some values, such as `scheduler.params.total_num_steps` are calculated by the [`Trainer`] during training. @@ -191,7 +191,7 @@ ZeRO-1 shards the optimizer states across GPUs, and you can expect a tiny speed -ZeRO-2 shards the optimizer and gradients across GPUs. This stage is primarily used for training since it's features are not relevant to inference. Some important parameters to configure for better performance include: +ZeRO-2 shards the optimizer and gradients across GPUs. This stage is primarily used for training since its features are not relevant to inference. Some important parameters to configure for better performance include: * `offload_optimizer` should be enabled to reduce GPU memory usage. * `overlap_comm` when set to `true` trades off increased GPU memory usage to lower allreduce latency. This feature uses 4.5x the `allgather_bucket_size` and `reduce_bucket_size` values. In this example, they're set to `5e8` which means it requires 9GB of GPU memory. If your GPU memory is 8GB or less, you should reduce `overlap_comm` to lower the memory requirements and prevent an out-of-memory (OOM) error. @@ -226,7 +226,7 @@ ZeRO-3 shards the optimizer, gradient, and parameters across GPUs. 
Unlike ZeRO-2 * `pin_memory: true` can improve throughput, but less memory becomes available for other processes because the pinned memory is reserved for the specific process that requested it and it's typically accessed much faster than normal CPU memory. * `stage3_max_live_parameters` is the upper limit on how many full parameters you want to keep on the GPU at any given time. Reduce this value if you encounter an OOM error. * `stage3_max_reuse_distance` is a value for determining when a parameter is used again in the future, and it helps decide whether to throw the parameter away or to keep it. If the parameter is going to be reused (if the value is less than `stage3_max_reuse_distance`), then it is kept to reduce communication overhead. This is super helpful when activation checkpointing is enabled and you want to keep the parameter in the forward recompute until the backward pass. But reduce this value if you encounter an OOM error. -* `stage3_gather_16bit_weights_on_model_save` consolidates fp16 weights when a model is saved. For large models and multiple GPUs, this is an expensive in terms of memory and speed. You should enable it if you're planning on resuming training. +* `stage3_gather_16bit_weights_on_model_save` consolidates fp16 weights when a model is saved. For large models and multiple GPUs, this is expensive in terms of memory and speed. You should enable it if you're planning on resuming training. * `sub_group_size` controls which parameters are updated during the optimizer step. Parameters are grouped into buckets of `sub_group_size` and each bucket is updated one at a time. When used with NVMe offload, `sub_group_size` determines when model states are moved in and out of CPU memory from during the optimization step. This prevents running out of CPU memory for extremely large models. `sub_group_size` can be left to its default value if you aren't using NVMe offload, but you may want to change it if you: 1. Run into an OOM error during the optimizer step. In this case, reduce `sub_group_size` to reduce memory usage of the temporary buffers. diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index b000cc06779918..68430de643f17b 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -178,7 +178,7 @@ An increasing sequence: one, two, three, four, five, six, seven, eight, nine, te The `generate()` method supports caching keys and values to enhance efficiency and avoid re-computations. However the key and value cache can occupy a large portion of memory, becoming a bottleneck for long-context generation, especially for Large Language Models. -Quantizing the cache when using `generate()` can significantly reduce memory requirements at the cost of speed. +Quantizing the cache when using `generate()` can significantly reduce memory requirements at the cost of speed. KV Cache quantization in `transformers` is largely inspired by the paper [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache] (https://arxiv.org/abs/2402.02750) and currently supports `quanto` and `HQQ` as backends. For more information on the inner workings see the paper. @@ -213,11 +213,11 @@ I like rock music because it's loud and energetic. I like to listen to it when I ## Watermarking -The `generate()` supports watermarking the generated text by randomly marking a portion of tokens as "green". +The `generate()` supports watermarking the generated text by randomly marking a portion of tokens as "green". 
When generating the "green" will have a small 'bias' value added to their logits, thus having a higher chance to be generated. The watermarked text can be detected by calculating the proportion of "green" tokens in the text and estimating how likely it is -statistically to obtain that amount of "green" tokens for human-generated text. This watermarking strategy was proposed in the paper -["On the Reliability of Watermarks for Large Language Models"](https://arxiv.org/abs/2306.04634). For more information on +statistically to obtain that amount of "green" tokens for human-generated text. This watermarking strategy was proposed in the paper +["On the Reliability of Watermarks for Large Language Models"](https://arxiv.org/abs/2306.04634). For more information on the inner functioning of watermarking, it is recommended to refer to the paper. The watermarking can be used with any generative model in `tranformers` and does not require an extra classification model @@ -484,3 +484,59 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t Alternativelly, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259). +### DoLa Decoding + +**D**ecoding by C**o**ntrasting **La**yers (DoLa) is a contrastive decoding strategy to improve the factuality and reduce the +hallucinations of LLMs, as described in this paper of ICLR 2024 [DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language Models](https://arxiv.org/abs/2309.03883). + +DoLa is achieved by contrasting the differences in logits obtained from final +layers versus earlier layers, thus amplify the factual knowledge localized to particular part of transformer layers. + +Do the following two steps to activate DoLa decoding when calling the `model.generate` function: +1. Set the `dola_layers` argument, which can be either a string or a list of integers. + - If set to a string, it can be one of `low`, `high`. + - If set to a list of integers, it should be a list of layer indices between 0 and the total number of layers in the model. The 0-th layer is word embedding, and the 1st layer is the first transformer layer, and so on. +2. Set `repetition_penalty = 1.2` is suggested to reduce repetition in DoLa decoding. + +See the following examples for DoLa decoding with the 32-layer LLaMA-7B model. + +```python +>>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed +>>> import torch + +>>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") +>>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16) +>>> device = 'cuda' if torch.cuda.is_available() else 'cpu' +>>> model.to(device) +>>> set_seed(42) + +>>> text = "On what date was the Declaration of Independence officially signed?" 
+>>> inputs = tokenizer(text, return_tensors="pt").to(device)
+
+# Vanilla greedy decoding
+>>> vanilla_output = model.generate(**inputs, do_sample=False, max_new_tokens=50)
+>>> tokenizer.batch_decode(vanilla_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
+['\nThe Declaration of Independence was signed on July 4, 1776.\nWhat was the date of the signing of the Declaration of Independence?\nThe Declaration of Independence was signed on July 4,']
+
+# DoLa decoding with contrasting higher part of layers (layers 16,18,...,30)
+>>> dola_high_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers='high')
+>>> tokenizer.batch_decode(dola_high_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
+['\nJuly 4, 1776, when the Continental Congress voted to separate from Great Britain. The 56 delegates to the Continental Congress signed the Declaration on August 2, 1776.']
+
+# DoLa decoding with contrasting specific layers (layers 28 and 30)
+>>> dola_custom_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers=[28,30], repetition_penalty=1.2)
+>>> tokenizer.batch_decode(dola_custom_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
+['\nIt was officially signed on 2 August 1776, when 56 members of the Second Continental Congress, representing the original 13 American colonies, voted unanimously for the resolution for independence. The 2']
+```
+
+#### Understanding the `dola_layers` argument
+
+`dola_layers` stands for the candidate layers in premature layer selection, as described in the DoLa paper. The selected premature layer will be contrasted with the final layer.
+
+Setting `dola_layers` to `'low'` or `'high'` will select the lower or higher part of the layers to contrast, respectively.
+- For `N`-layer models with `N <= 40` layers, the layers of `range(0, N // 2, 2)` and `range(N // 2, N, 2)` are used for `'low'` and `'high'` layers, respectively.
+- For models with `N > 40` layers, the layers of `range(0, 20, 2)` and `range(N - 20, N, 2)` are used for `'low'` and `'high'` layers, respectively.
+- If the model has tied word embeddings, we skip the word embeddings (0-th) layer and start from the 2nd layer, as the early exit from word embeddings will become an identity function.
+- Set `dola_layers` to a list of integer layer indices to contrast manually specified layers. For example, setting `dola_layers=[28,30]` will contrast the final layer (32nd layer) with the 28th and 30th layers.
+
+The paper suggests contrasting `'high'` layers to improve short-answer tasks like TruthfulQA, and contrasting `'low'` layers to improve all the other long-answer reasoning tasks, such as GSM8K, StrategyQA, FACTOR, and VicunaQA. Applying DoLa to smaller models like GPT-2 is not recommended, as shown by the results in Appendix N of the paper.
diff --git a/docs/source/en/glossary.md b/docs/source/en/glossary.md
index f3c2c50d705ab6..d9fdac2475f23b 100644
--- a/docs/source/en/glossary.md
+++ b/docs/source/en/glossary.md
@@ -139,7 +139,7 @@ reading the whole sentence with a mask to hide future tokens at a certain timest
 
 ### deep learning (DL)
 
-Machine learning algorithms which uses neural networks with several layers.
+Machine learning algorithms which use neural networks with several layers.
 
 ## E
 
@@ -519,4 +519,4 @@ A form of model training in which data provided to the model is not labeled.
Uns Parallelism technique which performs sharding of the tensors somewhat similar to [TensorParallel](#tensor-parallelism-tp), except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. This method also supports various offloading techniques to compensate for limited GPU memory. -Learn more about ZeRO [here](perf_train_gpu_many#zero-data-parallelism). \ No newline at end of file +Learn more about ZeRO [here](perf_train_gpu_many#zero-data-parallelism). diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ac026067ac24b7..99aa40bf995325 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -343,5 +343,6 @@ Flax), PyTorch, and/or TensorFlow. | [XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2) | ✅ | ✅ | ✅ | | [YOLOS](model_doc/yolos) | ✅ | ❌ | ❌ | | [YOSO](model_doc/yoso) | ✅ | ❌ | ❌ | +| [ZoeDepth](model_doc/zoedepth) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index 5bf8b5c4a0b36f..da7ea25e54b6b0 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -391,6 +391,12 @@ A [`Constraint`] can be used to force the generation to include specific tokens - get_seq_length - reset +[[autodoc]] EncoderDecoderCache + - get_seq_length + - to_legacy_cache + - from_legacy_cache + - reset + - reorder_cache ## Watermark Utils diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md index 93848d72b0d811..23086929f6d54a 100644 --- a/docs/source/en/llm_tutorial_optimization.md +++ b/docs/source/en/llm_tutorial_optimization.md @@ -147,7 +147,7 @@ Let's call it now for the next experiment. ```python flush() ``` -In the recent version of the accelerate library, you can also use an utility method called `release_memory()` +In the recent version of the accelerate library, you can also use a utility method called `release_memory()` ```python from accelerate.utils import release_memory @@ -683,7 +683,7 @@ Assistant: Germany has ca. 81 million inhabitants In this chat, the LLM runs auto-regressive decoding twice: 1. The first time, the key-value cache is empty and the input prompt is `"User: How many people live in France?"` and the model auto-regressively generates the text `"Roughly 75 million people live in France"` while increasing the key-value cache at every decoding step. - 2. The second time the input prompt is `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many in Germany?"`. Thanks to the cache, all key-value vectors for the first two sentences are already computed. Therefore the input prompt only consists of `"User: And how many in Germany?"`. While processing the shortened input prompt, it's computed key-value vectors are concatenated to the key-value cache of the first decoding. The second Assistant's answer `"Germany has ca. 81 million inhabitants"` is then auto-regressively generated with the key-value cache consisting of encoded key-value vectors of `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany?"`. + 2. The second time the input prompt is `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many in Germany?"`. Thanks to the cache, all key-value vectors for the first two sentences are already computed. 
Therefore the input prompt only consists of `"User: And how many in Germany?"`. While processing the shortened input prompt, its computed key-value vectors are concatenated to the key-value cache of the first decoding. The second Assistant's answer `"Germany has ca. 81 million inhabitants"` is then auto-regressively generated with the key-value cache consisting of encoded key-value vectors of `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany?"`. Two things should be noted here: 1. Keeping all the context is crucial for LLMs deployed in chat so that the LLM understands all the previous context of the conversation. E.g. for the example above the LLM needs to understand that the user refers to the population when asking `"And how many are in Germany"`. diff --git a/docs/source/en/main_classes/callback.md b/docs/source/en/main_classes/callback.md index bc7323f5911ee6..ee91737ef05029 100644 --- a/docs/source/en/main_classes/callback.md +++ b/docs/source/en/main_classes/callback.md @@ -34,7 +34,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4 or tensorboardX). - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed. -- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.ml/site/) is installed. +- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed. - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed. - [`~integrations.NeptuneCallback`] if [neptune](https://neptune.ai/) is installed. - [`~integrations.AzureMLCallback`] if [azureml-sdk](https://pypi.org/project/azureml-sdk/) is diff --git a/docs/source/en/main_classes/pipelines.md b/docs/source/en/main_classes/pipelines.md index d7a701700d135c..d5d132aaaba566 100644 --- a/docs/source/en/main_classes/pipelines.md +++ b/docs/source/en/main_classes/pipelines.md @@ -270,6 +270,11 @@ This is a simplified view, since the pipeline can handle automatically the batch about how many forward passes you inputs are actually going to trigger, you can optimize the `batch_size` independently of the inputs. The caveats from the previous section still apply. +## Pipeline FP16 inference +Models can be run in FP16 which can be significantly faster on GPU while saving memory. Most models will not suffer noticeable performance loss from this. The larger the model, the less likely that it will. + +To enable FP16 inference, you can simply pass `torch_dtype=torch.float16` or `torch_dtype='float16'` to the pipeline constructor. Note that this only works for models with a PyTorch backend. Your inputs will be converted to FP16 internally. + ## Pipeline custom code If you want to override a specific pipeline. diff --git a/docs/source/en/model_doc/depth_anything.md b/docs/source/en/model_doc/depth_anything.md index 99332697b38ef2..e08e4bfc9904b7 100644 --- a/docs/source/en/model_doc/depth_anything.md +++ b/docs/source/en/model_doc/depth_anything.md @@ -20,6 +20,12 @@ rendered properly in your Markdown viewer. The Depth Anything model was proposed in [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao. 
Depth Anything is based on the [DPT](dpt) architecture, trained on ~62 million images, obtaining state-of-the-art results for both relative and absolute depth estimation. + + +[Depth Anything V2](depth_anything_v2) was released in June 2024. It uses the same architecture as Depth Anything and therefore it is compatible with all code examples and existing workflows. However, it leverages synthetic data and a larger capacity teacher model to achieve much finer and robust depth predictions. + + + The abstract from the paper is the following: *This work presents Depth Anything, a highly practical solution for robust monocular depth estimation. Without pursuing novel technical modules, we aim to build a simple yet powerful foundation model dealing with any images under any circumstances. To this end, we scale up the dataset by designing a data engine to collect and automatically annotate large-scale unlabeled data (~62M), which significantly enlarges the data coverage and thus is able to reduce the generalization error. We investigate two simple yet effective strategies that make data scaling-up promising. First, a more challenging optimization target is created by leveraging data augmentation tools. It compels the model to actively seek extra visual knowledge and acquire robust representations. Second, an auxiliary supervision is developed to enforce the model to inherit rich semantic priors from pre-trained encoders. We evaluate its zero-shot capabilities extensively, including six public datasets and randomly captured photos. It demonstrates impressive generalization ability. Further, through fine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs are set. Our better depth model also results in a better depth-conditioned ControlNet.* diff --git a/docs/source/en/model_doc/depth_anything_v2.md b/docs/source/en/model_doc/depth_anything_v2.md new file mode 100644 index 00000000000000..49f655238efca6 --- /dev/null +++ b/docs/source/en/model_doc/depth_anything_v2.md @@ -0,0 +1,115 @@ + + +# Depth Anything V2 + +## Overview + +Depth Anything V2 was introduced in [the paper of the same name](https://arxiv.org/abs/2406.09414) by Lihe Yang et al. It uses the same architecture as the original [Depth Anything model](depth_anything), but uses synthetic data and a larger capacity teacher model to achieve much finer and robust depth predictions. + +The abstract from the paper is the following: + +*This work presents Depth Anything V2. Without pursuing fancy techniques, we aim to reveal crucial findings to pave the way towards building a powerful monocular depth estimation model. Notably, compared with V1, this version produces much finer and more robust depth predictions through three key practices: 1) replacing all labeled real images with synthetic images, 2) scaling up the capacity of our teacher model, and 3) teaching student models via the bridge of large-scale pseudo-labeled real images. Compared with the latest models built on Stable Diffusion, our models are significantly more efficient (more than 10x faster) and more accurate. We offer models of different scales (ranging from 25M to 1.3B params) to support extensive scenarios. Benefiting from their strong generalization capability, we fine-tune them with metric depth labels to obtain our metric depth models. 
In addition to our models, considering the limited diversity and frequent noise in current test sets, we construct a versatile evaluation benchmark with precise annotations and diverse scenes to facilitate future research.* + + + + Depth Anything overview. Taken from the original paper. + +The Depth Anything models were contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/DepthAnything/Depth-Anything-V2). + +## Usage example + +There are 2 main ways to use Depth Anything V2: either using the pipeline API, which abstracts away all the complexity for you, or by using the `DepthAnythingForDepthEstimation` class yourself. + +### Pipeline API + +The pipeline allows to use the model in a few lines of code: + +```python +>>> from transformers import pipeline +>>> from PIL import Image +>>> import requests + +>>> # load pipe +>>> pipe = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf") + +>>> # load image +>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> # inference +>>> depth = pipe(image)["depth"] +``` + +### Using the model yourself + +If you want to do the pre- and post-processing yourself, here's how to do that: + +```python +>>> from transformers import AutoImageProcessor, AutoModelForDepthEstimation +>>> import torch +>>> import numpy as np +>>> from PIL import Image +>>> import requests + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> image_processor = AutoImageProcessor.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf") +>>> model = AutoModelForDepthEstimation.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf") + +>>> # prepare image for the model +>>> inputs = image_processor(images=image, return_tensors="pt") + +>>> with torch.no_grad(): +... outputs = model(**inputs) +... predicted_depth = outputs.predicted_depth + +>>> # interpolate to original size +>>> prediction = torch.nn.functional.interpolate( +... predicted_depth.unsqueeze(1), +... size=image.size[::-1], +... mode="bicubic", +... align_corners=False, +... ) + +>>> # visualize the prediction +>>> output = prediction.squeeze().cpu().numpy() +>>> formatted = (output * 255 / np.max(output)).astype("uint8") +>>> depth = Image.fromarray(formatted) +``` + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Depth Anything. + +- [Monocular depth estimation task guide](../tasks/depth_estimation) +- [Depth Anything V2 demo](https://huggingface.co/spaces/depth-anything/Depth-Anything-V2). +- A notebook showcasing inference with [`DepthAnythingForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Depth%20Anything/Predicting_depth_in_an_image_with_Depth_Anything.ipynb). 🌎 +- [Core ML conversion of the `small` variant for use on Apple Silicon](https://huggingface.co/apple/coreml-depth-anything-v2-small). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
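As a closing usage note before the API reference (an editorial sketch, not part of the original model card): the pipeline shown above can also run in half precision on a GPU, which is typically faster and lighter on memory with little loss in prediction quality. The snippet assumes a CUDA device is available and reuses the same `depth-anything/Depth-Anything-V2-Small-hf` checkpoint as the examples above.

```python
>>> from transformers import pipeline
>>> import torch

>>> # sketch: assumes a CUDA GPU; torch_dtype=torch.float16 enables FP16 inference in the pipeline
>>> pipe = pipeline(
...     task="depth-estimation",
...     model="depth-anything/Depth-Anything-V2-Small-hf",
...     device=0,
...     torch_dtype=torch.float16,
... )

>>> # the pipeline accepts a URL, a local path, or a PIL image
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> depth = pipe(url)["depth"]
```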
+ +## DepthAnythingConfig + +[[autodoc]] DepthAnythingConfig + +## DepthAnythingForDepthEstimation + +[[autodoc]] DepthAnythingForDepthEstimation + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/gemma2.md b/docs/source/en/model_doc/gemma2.md index fa16dfbc4ba0fc..5befa0b1f43777 100644 --- a/docs/source/en/model_doc/gemma2.md +++ b/docs/source/en/model_doc/gemma2.md @@ -19,12 +19,12 @@ rendered properly in your Markdown viewer. ## Overview -The Gemma2 model was proposed in [Gemma2: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/Gemma2-open-models/) by Gemma2 Team, Google. -Gemma2 models are trained on 6T tokens, and released with 2 versions, 2b and 7b. +The Gemma2 model was proposed in [Gemma2: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/google-gemma-2/) by Gemma2 Team, Google. +Two Gemma2 models are released, with parameters sizes of 9 billion (9B) and 27 billion (27B). -The abstract from the paper is the following: +The abstract from the blog post is the following: -*This work introduces Gemma2, a new family of open language models demonstrating strong performance across academic benchmarks for language understanding, reasoning, and safety. We release two sizes of models (2 billion and 7 billion parameters), and provide both pretrained and fine-tuned checkpoints. Gemma2 outperforms similarly sized open models on 11 out of 18 text-based tasks, and we present comprehensive evaluations of safety and responsibility aspects of the models, alongside a detailed description of our model development. We believe the responsible release of LLMs is critical for improving the safety of frontier models, and for enabling the next wave of LLM innovations* +*Now we’re officially releasing Gemma 2 to researchers and developers globally. Available in both 9 billion (9B) and 27 billion (27B) parameter sizes, Gemma 2 is higher-performing and more efficient at inference than the first generation, with significant safety advancements built in. In fact, at 27B, it offers competitive alternatives to models more than twice its size, delivering the kind of performance that was only possible with proprietary models as recently as December.* Tips: diff --git a/docs/source/en/model_doc/prophetnet.md b/docs/source/en/model_doc/prophetnet.md index 7e63e0c0887eea..764c3acb0674db 100644 --- a/docs/source/en/model_doc/prophetnet.md +++ b/docs/source/en/model_doc/prophetnet.md @@ -51,7 +51,7 @@ The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). - ProphetNet is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. -- The model architecture is based on the original Transformer, but replaces the “standard” self-attention mechanism in the decoder by a a main self-attention mechanism and a self and n-stream (predict) self-attention mechanism. +- The model architecture is based on the original Transformer, but replaces the “standard” self-attention mechanism in the decoder by a main self-attention mechanism and a self and n-stream (predict) self-attention mechanism. ## Resources diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index 6ee237942ab99d..4f46174fb187e8 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -27,7 +27,7 @@ The abstract from the paper is the following: ## Usage tips - Usage of SigLIP is similar to [CLIP](clip). 
The main difference is the training loss, which does not require a global view of all the pairwise similarities of images and texts within a batch. One needs to apply the sigmoid activation function to the logits, rather than the softmax.
-- Training is not yet supported. If you want to fine-tune SigLIP or train from scratch, refer to the loss function from [OpenCLIP](https://github.com/mlfoundations/open_clip/blob/73ad04ae7fb93ede1c02dc9040a828634cb1edf1/src/open_clip/loss.py#L307), which leverages various `torch.distributed` utilities.
+- Training is supported but does not use `torch.distributed` utilities, which may limit the scalability of batch size. However, DDP and FSDP work on a single-node multi-GPU setup.
 - When using the standalone [`SiglipTokenizer`] or [`SiglipProcessor`], make sure to pass `padding="max_length"` as that's how the model was trained.
 - To get the same results as the pipeline, a prompt template of "This is a photo of {label}." should be used.
@@ -107,6 +107,88 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
 
+
+## Combining SigLIP and Flash Attention 2
+
+First, make sure to install the latest version of Flash Attention 2.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Also make sure that your hardware is compatible with Flash Attention 2. Read more about it in the official documentation of the flash-attn repository. Also make sure to load your model in half-precision (e.g. `torch.float16`).
+
+To load and run a model using Flash Attention 2, refer to the snippet below:
+
+```python
+>>> import torch
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import SiglipProcessor, SiglipModel
+>>> device = "cuda" # the device to load the model onto
+
+>>> model = SiglipModel.from_pretrained(
+...     "google/siglip-so400m-patch14-384",
+...     attn_implementation="flash_attention_2",
+...     torch_dtype=torch.float16,
+...     device_map=device,
+... )
+>>> processor = SiglipProcessor.from_pretrained("google/siglip-so400m-patch14-384")
+
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> candidate_labels = ["2 cats", "2 dogs"]
+# follows the pipeline prompt template to get same results
+>>> candidate_labels = [f'This is a photo of {label}.' for label in candidate_labels]
+# important: we pass `padding=max_length` since the model was trained with this
+>>> inputs = processor(text=candidate_labels, images=image, padding="max_length", return_tensors="pt")
+>>> inputs.to(device)
+
+>>> with torch.no_grad():
+...     with torch.autocast(device):
+...         outputs = model(**inputs)
+
+>>> logits_per_image = outputs.logits_per_image
+>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
+>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
+51.3% that image 0 is 'This is a photo of 2 cats.'
+```
+
+
+## Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
+encompasses several implementations that can be applied depending on the inputs and the hardware in use.
See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +You may set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. Make sure you have `torch>=2.1.1`. + +```python +>>> from transformers import SiglipModel + +>>> model = SiglipModel.from_pretrained( +... "google/siglip-so400m-patch14-384", +... attn_implementation="sdpa", +... torch_dtype=torch.float16, +... device_map=device, +... ) +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + + +## Expected speedups + +Below is an expected speedup diagram that compares inference time between the native implementation in transformers using `google/siglip-so400m-patch14-384` checkpoint in `float16` precision and the Flash Attention 2 / SDPA version of the model using different batch sizes. + +
+ +
+ + ## SiglipConfig [[autodoc]] SiglipConfig diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md index 992ff71735db34..0565bd5aae111b 100644 --- a/docs/source/en/model_doc/whisper.md +++ b/docs/source/en/model_doc/whisper.md @@ -52,8 +52,6 @@ Here is a step-by-step guide to transcribing an audio sample using a pre-trained >>> # Select an audio file and read it: >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> audio_sample = ds[0]["audio"] ->>> waveform = audio_sample["array"] ->>> sampling_rate = audio_sample["sampling_rate"] >>> # Load the Whisper model in Hugging Face format: >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") @@ -61,7 +59,7 @@ Here is a step-by-step guide to transcribing an audio sample using a pre-trained >>> # Use the model and processor to transcribe the audio: >>> input_features = processor( -... waveform, sampling_rate=sampling_rate, return_tensors="pt" +... audio_sample["array"], sampling_rate=audio_sample["sampling_rate"], return_tensors="pt" ... ).input_features >>> # Generate token ids @@ -74,6 +72,49 @@ Here is a step-by-step guide to transcribing an audio sample using a pre-trained ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.' ``` +Whisper is compatible with the following optimisations: +- [PyTorch Scaled Dot Product Attention (SDPA)](../perf_infer_gpu_one#pytorch-scaled-dot-product-attention): flash attention and memory-efficient attention kernels. Enabled by default for `torch>=2.1.1`. +- [Flash Attention 2](../perf_infer_gpu_one#flashattention-2): improved implementation of flash attention through better parallelism and work partitioning. +- [torch.compile](../llm_optims#static-kv-cache-and-torchcompile): JIT-compile the forward pass to dispatch to efficient fused kernels. + +As an example, the following codesnippet enables SDPA and `torch.compile` for up to 5x faster inference: + +```python +>>> from datasets import load_dataset +>>> from transformers import WhisperProcessor, WhisperForConditionalGeneration + +>>> # Select an audio file and read it: +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +>>> audio_sample = ds[0]["audio"] + +>>> # Load the Whisper model with SDPA attention +>>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") +>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", attn_implementation="sdpa") + +>>> # Enable static cache and compile the forward pass +>>> model.generation_config.cache_implementation = "static" +>>> model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + +>>> # Use the model and processor to transcribe the audio: +>>> input_features = processor( +... audio_sample["array"], sampling_rate=audio_sample["sampling_rate"], return_tensors="pt" +... ).input_features + +>>> # Compile the forward pass +>>> _ = model.generate(input_features) + +>>> # Generate token ids using compiled graph (fast!) +>>> predicted_ids = model.generate(input_features) + +>>> # Decode token ids to text +>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) + +>>> transcription[0] +' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.' +``` + +For more details on each optimisation, refer to the documentation linked above. 
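Flash Attention 2 is listed among the compatible optimisations above but not demonstrated in code. As a minimal editorial sketch (assuming the `flash-attn` package is installed and a supported GPU is available), loading Whisper with it could look like the snippet below; inputs then need to be moved to the same device and dtype before calling `generate()`.

```python
>>> import torch
>>> from transformers import WhisperForConditionalGeneration

>>> # sketch: Flash Attention 2 requires half precision and a compatible CUDA GPU (assumption)
>>> model = WhisperForConditionalGeneration.from_pretrained(
...     "openai/whisper-tiny.en",
...     torch_dtype=torch.float16,
...     attn_implementation="flash_attention_2",
... ).to("cuda")
```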
+ ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Whisper. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/model_doc/zoedepth.md b/docs/source/en/model_doc/zoedepth.md new file mode 100644 index 00000000000000..d16da59ea98245 --- /dev/null +++ b/docs/source/en/model_doc/zoedepth.md @@ -0,0 +1,108 @@ + + +# ZoeDepth + +## Overview + +The ZoeDepth model was proposed in [ZoeDepth: Zero-shot Transfer by Combining Relative and Metric Depth](https://arxiv.org/abs/2302.12288) by Shariq Farooq Bhat, Reiner Birkl, Diana Wofk, Peter Wonka, Matthias Müller. ZoeDepth extends the [DPT](dpt) framework for metric (also called absolute) depth estimation. ZoeDepth is pre-trained on 12 datasets using relative depth and fine-tuned on two domains (NYU and KITTI) using metric depth. A lightweight head is used with a novel bin adjustment design called metric bins module for each domain. During inference, each input image is automatically routed to the appropriate head using a latent classifier. + +The abstract from the paper is the following: + +*This paper tackles the problem of depth estimation from a single image. Existing work either focuses on generalization performance disregarding metric scale, i.e. relative depth estimation, or state-of-the-art results on specific datasets, i.e. metric depth estimation. We propose the first approach that combines both worlds, leading to a model with excellent generalization performance while maintaining metric scale. Our flagship model, ZoeD-M12-NK, is pre-trained on 12 datasets using relative depth and fine-tuned on two datasets using metric depth. We use a lightweight head with a novel bin adjustment design called metric bins module for each domain. During inference, each input image is automatically routed to the appropriate head using a latent classifier. Our framework admits multiple configurations depending on the datasets used for relative depth pre-training and metric fine-tuning. Without pre-training, we can already significantly improve the state of the art (SOTA) on the NYU Depth v2 indoor dataset. Pre-training on twelve datasets and fine-tuning on the NYU Depth v2 indoor dataset, we can further improve SOTA for a total of 21% in terms of relative absolute error (REL). Finally, ZoeD-M12-NK is the first model that can jointly train on multiple datasets (NYU Depth v2 and KITTI) without a significant drop in performance and achieve unprecedented zero-shot generalization performance to eight unseen datasets from both indoor and outdoor domains.* + + + + ZoeDepth architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/isl-org/ZoeDepth). + +## Usage tips + +- ZoeDepth is an absolute (also called metric) depth estimation model, unlike DPT which is a relative depth estimation model. This means that ZoeDepth is able to estimate depth in metric units like meters. 
+ +The easiest to perform inference with ZoeDepth is by leveraging the [pipeline API](../main_classes/pipelines.md): + +```python +from transformers import pipeline +from PIL import Image +import requests + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +pipe = pipeline(task="depth-estimation", model="Intel/zoedepth-nyu-kitti") +result = pipe(image) +depth = result["depth"] +``` + +Alternatively, one can also perform inference using the classes: + +```python +from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation +import torch +import numpy as np +from PIL import Image +import requests + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti") +model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti") + +# prepare image for the model +inputs = image_processor(images=image, return_tensors="pt") + +with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + +# interpolate to original size +prediction = torch.nn.functional.interpolate( + predicted_depth.unsqueeze(1), + size=image.size[::-1], + mode="bicubic", + align_corners=False, +) + +# visualize the prediction +output = prediction.squeeze().cpu().numpy() +formatted = (output * 255 / np.max(output)).astype("uint8") +depth = Image.fromarray(formatted) +``` + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ZoeDepth. + +- A demo notebook regarding inference with ZoeDepth models can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ZoeDepth). 🌎 + +## ZoeDepthConfig + +[[autodoc]] ZoeDepthConfig + +## ZoeDepthImageProcessor + +[[autodoc]] ZoeDepthImageProcessor + - preprocess + +## ZoeDepthForDepthEstimation + +[[autodoc]] ZoeDepthForDepthEstimation + - forward \ No newline at end of file diff --git a/docs/source/en/perf_hardware.md b/docs/source/en/perf_hardware.md index c42b58483bebd2..eb09ab439b443d 100644 --- a/docs/source/en/perf_hardware.md +++ b/docs/source/en/perf_hardware.md @@ -116,7 +116,7 @@ Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvid So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture. -Let's compare the execution of a openai-community/gpt2 language model training over a small sample of wikitext. +Let's compare the execution of an openai-community/gpt2 language model training over a small sample of wikitext. 
The results are: diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 1569bef1f6ba1f..b18e737ff97361 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -70,6 +70,7 @@ FlashAttention-2 is currently supported for the following architectures: * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel) * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel) * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model) +* [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip) * [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel) * [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model) * [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model) @@ -231,6 +232,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [wav2vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2Model) * [Hubert](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.HubertModel) * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel) +* [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip) * [Sew](https://huggingface.co/docs/transformers/main/en/model_doc/sew#transformers.SEWModel) * [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel) * [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel) diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md index 990df0340bf1a6..5a72bba768d1d0 100644 --- a/docs/source/en/perf_train_gpu_one.md +++ b/docs/source/en/perf_train_gpu_one.md @@ -41,21 +41,22 @@ hyperparameter tuning, you should determine which batch size yields the best res The methods and tools covered in this guide can be classified based on the effect they have on the training process: -| Method/tool | Improves training speed | Optimizes memory utilization | -|:-----------------------------------------------------------|:------------------------|:-----------------------------| -| [Batch size choice](#batch-size-choice) | Yes | Yes | -| [Gradient accumulation](#gradient-accumulation) | No | Yes | -| [Gradient checkpointing](#gradient-checkpointing) | No | Yes | -| [Mixed precision training](#mixed-precision-training) | Yes | (No) | -| [Optimizer choice](#optimizer-choice) | Yes | Yes | -| [Data preloading](#data-preloading) | Yes | No | -| [DeepSpeed Zero](#deepspeed-zero) | No | Yes | -| [torch.compile](#using-torchcompile) | Yes | No | -| [Parameter-Efficient Fine Tuning (PEFT)](#using--peft) | No | Yes | +| Method/tool | Improves training speed | Optimizes memory utilization | +|:--------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------|:-----------------------------| +| [Batch size choice](#batch-size-choice) | Yes | Yes | +| [Gradient accumulation](#gradient-accumulation) | No | Yes | +| [Gradient checkpointing](#gradient-checkpointing) | No | Yes | +| [Mixed precision training](#mixed-precision-training) | Yes | Maybe* | +| 
[torch_empty_cache_steps](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.torch_empty_cache_steps) | No | Yes | +| [Optimizer choice](#optimizer-choice) | Yes | Yes | +| [Data preloading](#data-preloading) | Yes | No | +| [DeepSpeed Zero](#deepspeed-zero) | No | Yes | +| [torch.compile](#using-torchcompile) | Yes | No | +| [Parameter-Efficient Fine Tuning (PEFT)](#using--peft) | No | Yes | -Note: when using mixed precision with a small model and a large batch size, there will be some memory savings but with a +*Note: when using mixed precision with a small model and a large batch size, there will be some memory savings but with a large model and a small batch size, the memory use will be larger. diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md index 8518f639ab9d3d..838b89432b4a37 100644 --- a/docs/source/en/pipeline_tutorial.md +++ b/docs/source/en/pipeline_tutorial.md @@ -113,7 +113,9 @@ This will work regardless of whether you are using PyTorch or Tensorflow. transcriber = pipeline(model="openai/whisper-large-v2", device=0) ``` -If the model is too large for a single GPU and you are using PyTorch, you can set `device_map="auto"` to automatically +If the model is too large for a single GPU and you are using PyTorch, you can set `torch_dtype='float16'` to enable FP16 precision inference. Usually this would not cause significant performance drops but make sure you evaluate it on your models! + +Alternatively, you can set `device_map="auto"` to automatically determine how to load and store the model weights. Using the `device_map` argument requires the 🤗 [Accelerate](https://huggingface.co/docs/accelerate) package: @@ -342,4 +344,3 @@ gr.Interface.from_pipeline(pipe).launch() By default, the web demo runs on a local server. If you'd like to share it with others, you can generate a temporary public link by setting `share=True` in `launch()`. You can also host your demo on [Hugging Face Spaces](https://huggingface.co/spaces) for a permanent link. - diff --git a/docs/source/en/preprocessing.md b/docs/source/en/preprocessing.md index 82381057d3742b..1710fd5ecb8a5d 100644 --- a/docs/source/en/preprocessing.md +++ b/docs/source/en/preprocessing.md @@ -471,7 +471,7 @@ from [`DetrImageProcessor`] and define a custom `collate_fn` to batch images tog ## Multimodal -For tasks involving multimodal inputs, you'll need a [processor](main_classes/processors) to prepare your dataset for the model. A processor couples together two processing objects such as as tokenizer and feature extractor. +For tasks involving multimodal inputs, you'll need a [processor](main_classes/processors) to prepare your dataset for the model. A processor couples together two processing objects such as tokenizer and feature extractor. Load the [LJ Speech](https://huggingface.co/datasets/lj_speech) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) for more details on how to load a dataset) to see how you can use a processor for automatic speech recognition (ASR): diff --git a/docs/source/en/tasks/monocular_depth_estimation.md b/docs/source/en/tasks/monocular_depth_estimation.md index d3cc8f3c3c89be..1485cdadb48f13 100644 --- a/docs/source/en/tasks/monocular_depth_estimation.md +++ b/docs/source/en/tasks/monocular_depth_estimation.md @@ -23,23 +23,26 @@ a single camera viewpoint. 
Monocular depth estimation has various applications, including 3D reconstruction, augmented reality, autonomous driving, and robotics. It is a challenging task as it requires the model to understand the complex relationships between objects in the scene and the corresponding depth information, which can be affected by factors such as lighting conditions, -occlusion, and texture. +occlusion, and texture. - +There are two main depth estimation categories: -To see all architectures and checkpoints compatible with this task, we recommend checking the [task-page](https://huggingface.co/tasks/depth-anything) +- **Absolute depth estimation**: This task variant aims to provide exact depth measurements from the camera. The term is used interchangeably with metric depth estimation, where depth is provided in precise measurements in meters or feet. Absolute depth estimation models output depth maps with numerical values that represent real-world distances. - +- **Relative depth estimation**: Relative depth estimation aims to predict the depth order of objects or points in a scene without providing the precise measurements. These models output a depth map that indicates which parts of the scene are closer or farther relative to each other without the actual distances to A and B. -In this guide you'll learn how to: +In this guide, we will see how to infer with [Depth Anything V2](https://huggingface.co/depth-anything/Depth-Anything-V2-Large), a state-of-the-art zero-shot relative depth estimation model, and [ZoeDepth](https://huggingface.co/docs/transformers/main/en/model_doc/zoedepth), an absolute depth estimation model. -* create a depth estimation pipeline -* run depth estimation inference by hand + -Before you begin, make sure you have all the necessary libraries installed: +Check the [Depth Estimation](https://huggingface.co/tasks/depth-estimation) task page to view all compatible architectures and checkpoints. + + + +Before we begin, we need to install the latest version of Transformers: ```bash -pip install -q transformers +pip install -q -U transformers ``` ## Depth estimation pipeline @@ -49,9 +52,11 @@ Instantiate a pipeline from a [checkpoint on the Hugging Face Hub](https://huggi ```py >>> from transformers import pipeline +>>> import torch ->>> checkpoint = "vinvino02/glpn-nyu" ->>> depth_estimator = pipeline("depth-estimation", model=checkpoint) +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> checkpoint = "depth-anything/Depth-Anything-V2-base-hf" +>>> pipe = pipeline("depth-estimation", model=checkpoint, device=device) ``` Next, choose an image to analyze: @@ -60,19 +65,19 @@ Next, choose an image to analyze: >>> from PIL import Image >>> import requests ->>> url = "https://unsplash.com/photos/HwBAsSbPBDU/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8MzR8fGNhciUyMGluJTIwdGhlJTIwc3RyZWV0fGVufDB8MHx8fDE2Nzg5MDEwODg&force=true&w=640" +>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> image ```
-    <img alt="Photo of a busy street"/>
+    <img alt="Photo of a bee"/>
</div>
Pass the image to the pipeline. ```py ->>> predictions = depth_estimator(image) +>>> predictions = pipe(image) ``` The pipeline returns a dictionary with two entries. The first one, called `predicted_depth`, is a tensor with the values @@ -99,17 +104,17 @@ Here we'll use the same checkpoint as before: ```py >>> from transformers import AutoImageProcessor, AutoModelForDepthEstimation ->>> checkpoint = "vinvino02/glpn-nyu" +>>> checkpoint = "Intel/zoedepth-nyu-kitti" >>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) ->>> model = AutoModelForDepthEstimation.from_pretrained(checkpoint) +>>> model = AutoModelForDepthEstimation.from_pretrained(checkpoint).to(device) ``` Prepare the image input for the model using the `image_processor` that will take care of the necessary image transformations such as resizing and normalization: ```py ->>> pixel_values = image_processor(image, return_tensors="pt").pixel_values +>>> pixel_values = image_processor(image, return_tensors="pt").pixel_values.to(device) ``` Pass the prepared inputs through the model: @@ -119,28 +124,100 @@ Pass the prepared inputs through the model: >>> with torch.no_grad(): ... outputs = model(pixel_values) -... predicted_depth = outputs.predicted_depth ``` -Visualize the results: +Let's post-process and visualize the results. + +We need to pad and then resize the outputs so that predicted depth map has the same dimension as the original image. After resizing we will remove the padded regions from the depth. ```py >>> import numpy as np +>>> import torch.nn.functional as F + +>>> predicted_depth = outputs.predicted_depth.unsqueeze(dim=1) +>>> height, width = pixel_values.shape[2:] ->>> # interpolate to original size ->>> prediction = torch.nn.functional.interpolate( -... predicted_depth.unsqueeze(1), -... size=image.size[::-1], -... mode="bicubic", -... align_corners=False, -... ).squeeze() ->>> output = prediction.numpy() - ->>> formatted = (output * 255 / np.max(output)).astype("uint8") ->>> depth = Image.fromarray(formatted) ->>> depth +>>> height_padding_factor = width_padding_factor = 3 +>>> pad_h = int(np.sqrt(height/2) * height_padding_factor) +>>> pad_w = int(np.sqrt(width/2) * width_padding_factor) + +>>> if predicted_depth.shape[-2:] != pixel_values.shape[-2:]: +>>> predicted_depth = F.interpolate(predicted_depth, size= (height, width), mode='bicubic', align_corners=False) + +>>> if pad_h > 0: + predicted_depth = predicted_depth[:, :, pad_h:-pad_h,:] +>>> if pad_w > 0: + predicted_depth = predicted_depth[:, :, :, pad_w:-pad_w] ``` +We can now visualize the results (the function below is taken from the [GaussianObject](https://github.com/GaussianObject/GaussianObject/blob/ad6629efadb57902d5f8bc0fa562258029a4bdf1/pred_monodepth.py#L11) framework). + +```py +import matplotlib + +def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None): + """Converts a depth map to a color image. + + Args: + value (torch.Tensor, numpy.ndarry): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed + vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None. + vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None. + cmap (str, optional): matplotlib colormap to use. Defaults to 'magma_r'. 
+ invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99. + invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None. + background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255). + gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False. + value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None. + + Returns: + numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4) + """ + if isinstance(value, torch.Tensor): + value = value.detach().cpu().numpy() + + value = value.squeeze() + if invalid_mask is None: + invalid_mask = value == invalid_val + mask = np.logical_not(invalid_mask) + + # normalize + vmin = np.percentile(value[mask],2) if vmin is None else vmin + vmax = np.percentile(value[mask],85) if vmax is None else vmax + if vmin != vmax: + value = (value - vmin) / (vmax - vmin) # vmin..vmax + else: + # Avoid 0-division + value = value * 0. + + # squeeze last dim if it exists + # grey out the invalid values + + value[invalid_mask] = np.nan + cmapper = matplotlib.colormaps.get_cmap(cmap) + if value_transform: + value = value_transform(value) + # value = value / value.max() + value = cmapper(value, bytes=True) # (nxmx4) + + # img = value[:, :, :] + img = value[...] + img[invalid_mask] = background_color + + # return img.transpose((2, 0, 1)) + if gamma_corrected: + # gamma correction + img = img / 255 + img = np.power(img, 2.2) + img = img * 255 + img = img.astype(np.uint8) + return img + +>>> result = colorize(predicted_depth.cpu().squeeze().numpy()) +>>> Image.fromarray(result) +``` + + +
-    <img alt="Depth estimation visualization"/>
+    <img alt="Depth estimation visualization"/>
</div>
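If you only want a quick look at the predicted depth without a colormap, a simpler alternative to the `colorize` helper above is to normalize the depth map to 8-bit and render it as a grayscale image. This is just a sketch that reuses `predicted_depth` and `Image` from the code above, not part of the original guide:

```py
>>> depth = predicted_depth.cpu().squeeze().numpy()

>>> # rescale to the 0-255 range for display; whether bright means near or far
>>> # depends on the model's convention
>>> depth_8bit = ((depth - depth.min()) / (depth.max() - depth.min()) * 255).astype("uint8")
>>> Image.fromarray(depth_8bit)
```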
diff --git a/docs/source/en/testing.md b/docs/source/en/testing.md index e26411f69e1e18..606cde849f1d1e 100644 --- a/docs/source/en/testing.md +++ b/docs/source/en/testing.md @@ -1011,7 +1011,7 @@ slow models to do qualitative testing. To see the use of these simply look for * grep tiny tests examples ``` -Here is a an example of a [script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) that created the tiny model +Here is an example of a [script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) that created the tiny model [stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de). You can easily adjust it to your specific model's architecture. diff --git a/docs/source/ja/main_classes/callback.md b/docs/source/ja/main_classes/callback.md index 3ea4938841e386..a90044b6cd3769 100644 --- a/docs/source/ja/main_classes/callback.md +++ b/docs/source/ja/main_classes/callback.md @@ -35,7 +35,7 @@ rendered properly in your Markdown viewer. - [`~integrations.TensorBoardCallback`] (PyTorch >= 1.4 を介して) tensorboard にアクセスできる場合 またはテンソルボードX)。 - [`~integrations.WandbCallback`] [wandb](https://www.wandb.com/) がインストールされている場合。 -- [`~integrations.CometCallback`] [comet_ml](https://www.comet.ml/site/) がインストールされている場合。 +- [`~integrations.CometCallback`] [comet_ml](https://www.comet.com/site/) がインストールされている場合。 - [mlflow](https://www.mlflow.org/) がインストールされている場合は [`~integrations.MLflowCallback`]。 - [`~integrations.NeptuneCallback`] [neptune](https://neptune.ai/) がインストールされている場合。 - [`~integrations.AzureMLCallback`] [azureml-sdk](https://pypi.org/project/azureml-sdk/) の場合 diff --git a/docs/source/zh/main_classes/callback.md b/docs/source/zh/main_classes/callback.md index be05c37aec9e73..3642207d75b951 100644 --- a/docs/source/zh/main_classes/callback.md +++ b/docs/source/zh/main_classes/callback.md @@ -28,7 +28,7 @@ Callbacks是“只读”的代码片段,除了它们返回的[TrainerControl] - [`PrinterCallback`] 或 [`ProgressCallback`],用于显示进度和打印日志(如果通过[`TrainingArguments`]停用tqdm,则使用第一个函数;否则使用第二个)。 - [`~integrations.TensorBoardCallback`],如果TensorBoard可访问(通过PyTorch版本 >= 1.4 或者 tensorboardX)。 - [`~integrations.WandbCallback`],如果安装了[wandb](https://www.wandb.com/)。 -- [`~integrations.CometCallback`],如果安装了[comet_ml](https://www.comet.ml/site/)。 +- [`~integrations.CometCallback`],如果安装了[comet_ml](https://www.comet.com/site/)。 - [`~integrations.MLflowCallback`],如果安装了[mlflow](https://www.mlflow.org/)。 - [`~integrations.NeptuneCallback`],如果安装了[neptune](https://neptune.ai/)。 - [`~integrations.AzureMLCallback`],如果安装了[azureml-sdk](https://pypi.org/project/azureml-sdk/)。 diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md index 178102ec092aeb..4e318b3edb920c 100644 --- a/examples/pytorch/README.md +++ b/examples/pytorch/README.md @@ -200,7 +200,7 @@ You can easily log and monitor your runs code. 
The following are currently suppo * [TensorBoard](https://www.tensorflow.org/tensorboard) * [Weights & Biases](https://docs.wandb.ai/integrations/huggingface) -* [Comet ML](https://www.comet.ml/docs/python-sdk/huggingface/) +* [Comet ML](https://www.comet.com/docs/v2/integrations/ml-frameworks/transformers/) * [Neptune](https://docs.neptune.ai/integrations-and-supported-tools/model-training/hugging-face) * [ClearML](https://clear.ml/docs/latest/docs/getting_started/ds/ds_first_steps) * [DVCLive](https://dvc.org/doc/dvclive/ml-frameworks/huggingface) @@ -244,7 +244,7 @@ Additional configuration options are available through generic [wandb environmen Refer to related [documentation & examples](https://docs.wandb.ai/integrations/huggingface). -### Comet.ml +### Comet To use `comet_ml`, install the Python package with: diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index 6053236f22c4b5..e2f3771b9bd248 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -20,7 +20,7 @@ boto3==1.16.34 botocore==1.19.63 Brotli==1.0.9 cachetools==5.0.0 -certifi==2023.7.22 +certifi==2024.7.4 cffi==1.15.0 chardet==4.0.0 charset-normalizer==2.0.12 @@ -237,4 +237,4 @@ Werkzeug==3.0.3 wrapt==1.14.0 xxhash==3.0.0 yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file +zipp==3.19.1 \ No newline at end of file diff --git a/examples/research_projects/lxmert/requirements.txt b/examples/research_projects/lxmert/requirements.txt index 501854c029e307..5e2076be9ddf1f 100644 --- a/examples/research_projects/lxmert/requirements.txt +++ b/examples/research_projects/lxmert/requirements.txt @@ -4,7 +4,7 @@ async-generator==1.10 attrs==20.2.0 backcall==0.2.0 CacheControl==0.12.6 -certifi==2023.7.22 +certifi==2024.7.4 cffi==1.14.2 chardet==3.0.4 click==7.1.2 diff --git a/examples/tensorflow/language-modeling-tpu/requirements.txt b/examples/tensorflow/language-modeling-tpu/requirements.txt index 60bbe767a21427..47ec780c02def9 100644 --- a/examples/tensorflow/language-modeling-tpu/requirements.txt +++ b/examples/tensorflow/language-modeling-tpu/requirements.txt @@ -1,3 +1,3 @@ -transformers==4.26.1 +transformers==4.38.0 datasets==2.9.0 tokenizers==0.13.2 diff --git a/examples/tensorflow/question-answering/README.md b/examples/tensorflow/question-answering/README.md index 41cc8b7ef30c69..c7e85623199fbe 100644 --- a/examples/tensorflow/question-answering/README.md +++ b/examples/tensorflow/question-answering/README.md @@ -18,11 +18,12 @@ limitations under the License. This folder contains the `run_qa.py` script, demonstrating *question answering* with the 🤗 Transformers library. For straightforward use-cases you may be able to use this script without modification, although we have also -included comments in the code to indicate areas that you may need to adapt to your own projects. +included comments in the code to indicate areas that you may need to adapt to your own projects. ### Usage notes + Note that when contexts are long they may be split into multiple training cases, not all of which may contain -the answer span. +the answer span. As-is, the example script will train on SQuAD or any other question-answering dataset formatted the same way, and can handle user inputs as well. @@ -32,7 +33,7 @@ inputs as well. By default, the script uses a `MirroredStrategy` and will use multiple GPUs effectively if they are available. 
TPUs can also be used by passing the name of the TPU resource with the `--tpu` argument. There are some issues surrounding these strategies and our models right now, which are most likely to appear in the evaluation/prediction steps. We're -actively working on better support for multi-GPU and TPU training in TF, but if you encounter problems a quick +actively working on better support for multi-GPU and TPU training in TF, but if you encounter problems a quick workaround is to train in the multi-GPU or TPU context and then perform predictions outside of it. ### Memory usage and data loading @@ -40,16 +41,17 @@ workaround is to train in the multi-GPU or TPU context and then perform predicti One thing to note is that all data is loaded into memory in this script. Most question answering datasets are small enough that this is not an issue, but if you have a very large dataset you will need to modify the script to handle data streaming. This is particularly challenging for TPUs, given the stricter requirements and the sheer volume of data -required to keep them fed. A full explanation of all the possible pitfalls is a bit beyond this example script and -README, but for more information you can see the 'Input Datasets' section of +required to keep them fed. A full explanation of all the possible pitfalls is a bit beyond this example script and +README, but for more information you can see the 'Input Datasets' section of [this document](https://www.tensorflow.org/guide/tpu). ### Example command + ```bash python run_qa.py \ --model_name_or_path distilbert/distilbert-base-cased \ --output_dir output \ --dataset_name squad \ --do_train \ ---do_eval \ +--do_eval ``` diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c559ed61acad03..c6679fa2f29428 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -807,6 +807,7 @@ "models.xmod": ["XmodConfig"], "models.yolos": ["YolosConfig"], "models.yoso": ["YosoConfig"], + "models.zoedepth": ["ZoeDepthConfig"], "onnx": [], "pipelines": [ "AudioClassificationPipeline", @@ -1182,6 +1183,7 @@ _import_structure["models.vitmatte"].append("VitMatteImageProcessor") _import_structure["models.vivit"].append("VivitImageProcessor") _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"]) + _import_structure["models.zoedepth"].append("ZoeDepthImageProcessor") try: if not is_torchvision_available(): @@ -1212,6 +1214,7 @@ "Cache", "CacheConfig", "DynamicCache", + "EncoderDecoderCache", "HQQQuantizedCache", "QuantizedCache", "QuantizedCacheConfig", @@ -3585,6 +3588,12 @@ "YosoPreTrainedModel", ] ) + _import_structure["models.zoedepth"].extend( + [ + "ZoeDepthForDepthEstimation", + "ZoeDepthPreTrainedModel", + ] + ) _import_structure["optimization"] = [ "Adafactor", "AdamW", @@ -5496,6 +5505,7 @@ from .models.xmod import XmodConfig from .models.yolos import YolosConfig from .models.yoso import YosoConfig + from .models.zoedepth import ZoeDepthConfig # Pipelines from .pipelines import ( @@ -5871,6 +5881,7 @@ from .models.vitmatte import VitMatteImageProcessor from .models.vivit import VivitImageProcessor from .models.yolos import YolosFeatureExtractor, YolosImageProcessor + from .models.zoedepth import ZoeDepthImageProcessor try: if not is_torchvision_available(): @@ -5895,6 +5906,7 @@ Cache, CacheConfig, DynamicCache, + EncoderDecoderCache, HQQQuantizedCache, QuantizedCache, QuantizedCacheConfig, @@ -7796,6 +7808,10 @@ YosoModel, YosoPreTrainedModel, ) + from .models.zoedepth import ( + 
ZoeDepthForDepthEstimation, + ZoeDepthPreTrainedModel, + ) # Optimization from .optimization import ( diff --git a/src/transformers/agents/agent_types.py b/src/transformers/agents/agent_types.py index 87255dc7dec98a..0b4999b7f76d3c 100644 --- a/src/transformers/agents/agent_types.py +++ b/src/transformers/agents/agent_types.py @@ -188,7 +188,7 @@ def __init__(self, value, samplerate=16_000): self.samplerate = samplerate if isinstance(value, (str, pathlib.Path)): self._path = value - elif isinstance(value, torch.Tensor): + elif is_torch_available() and isinstance(value, torch.Tensor): self._tensor = value elif isinstance(value, tuple): self.samplerate = value[0] @@ -232,7 +232,10 @@ def to_string(self): AGENT_TYPE_MAPPING = {"text": AgentText, "image": AgentImage, "audio": AgentAudio} -INSTANCE_TYPE_MAPPING = {str: AgentText, float: AgentText, int: AgentText, Tensor: AgentAudio, ImageType: AgentImage} +INSTANCE_TYPE_MAPPING = {str: AgentText, ImageType: AgentImage} + +if is_torch_available(): + INSTANCE_TYPE_MAPPING[Tensor] = AgentAudio def handle_agent_inputs(*args, **kwargs): @@ -251,4 +254,4 @@ def handle_agent_outputs(output, output_type=None): for _k, _v in INSTANCE_TYPE_MAPPING.items(): if isinstance(output, _k): return _v(output) - return AgentType(output) + return output diff --git a/src/transformers/agents/agents.py b/src/transformers/agents/agents.py index 63a2c3889ba842..1ddfb6b4174777 100644 --- a/src/transformers/agents/agents.py +++ b/src/transformers/agents/agents.py @@ -337,6 +337,7 @@ def __init__( self._toolbox.add_base_tools(add_python_interpreter=(self.__class__ == ReactJsonAgent)) else: self._toolbox = Toolbox(tools, add_base_tools=add_base_tools) + self._toolbox.add_tool(FinalAnswerTool()) self.system_prompt = format_prompt_with_tools( self._toolbox, self.system_prompt_template, self.tool_description_template @@ -631,8 +632,6 @@ def __init__( tool_description_template=tool_description_template, **kwargs, ) - if "final_answer" not in self._toolbox.tools: - self._toolbox.add_tool(FinalAnswerTool()) def provide_final_answer(self, task) -> str: """ @@ -857,6 +856,10 @@ def __init__( self.additional_authorized_imports = additional_authorized_imports if additional_authorized_imports else [] self.authorized_imports = list(set(LIST_SAFE_MODULES) | set(self.additional_authorized_imports)) self.system_prompt = self.system_prompt.replace("<>", str(self.authorized_imports)) + self.available_tools = { + **BASE_PYTHON_TOOLS.copy(), + **self.toolbox.tools, + } # This list can be augmented by the code agent creating some new functions def step(self): """ @@ -906,10 +909,9 @@ def step(self): # Execute self.log_code_action(code_action) try: - available_tools = {**BASE_PYTHON_TOOLS.copy(), **self.toolbox.tools} result = self.python_evaluator( code_action, - available_tools, + tools=self.available_tools, state=self.state, authorized_imports=self.authorized_imports, ) diff --git a/src/transformers/agents/default_tools.py b/src/transformers/agents/default_tools.py index 6ab971a4803c32..4a6f6dad3dca13 100644 --- a/src/transformers/agents/default_tools.py +++ b/src/transformers/agents/default_tools.py @@ -180,7 +180,7 @@ def forward(self, code): class FinalAnswerTool(Tool): name = "final_answer" - description = "Provides a final answer to the given problem" + description = "Provides a final answer to the given problem." 
inputs = {"answer": {"type": "text", "description": "The final answer to the problem"}} output_type = "any" diff --git a/src/transformers/agents/prompts.py b/src/transformers/agents/prompts.py index 3a867e8dc9bfe0..515ce3d439a499 100644 --- a/src/transformers/agents/prompts.py +++ b/src/transformers/agents/prompts.py @@ -52,8 +52,9 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): To help you, I will give you access to a set of tools that you can use. Each tool is a Python function and has a description explaining the task it performs, the inputs it expects and the outputs it returns. You should first explain which tool you will use to perform the task and for what reason, then write the code in Python. Each instruction in Python should be a simple assignment. You can print intermediate results if it makes sense to do so. +In the end, use tool 'final_answer' to return your answer, its argument will be what gets returned. You can use imports in your code, but only from the following list of modules: <> -Be sure to provide a 'Code:' token, else the system will be stuck in a loop. +Be sure to provide a 'Code:' token, else the run will fail. Tools: <> @@ -68,7 +69,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): translated_question = translator(question=question, src_lang="French", tgt_lang="English") print(f"The translated question is {translated_question}.") answer = image_qa(image=image, question=translated_question) -print(f"The answer is {answer}") +final_answer(f"The answer is {answer}") ``` --- @@ -80,6 +81,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): answer = document_qa(document, question="What is the oldest person?") print(f"The answer is {answer}.") image = image_generator(answer) +final_answer(image) ``` --- @@ -89,6 +91,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): Code: ```py image = image_generator(prompt=caption) +final_answer(image) ``` --- @@ -100,6 +103,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): summarized_text = summarizer(text) print(f"Summary: {summarized_text}") audio_summary = text_reader(summarized_text) +final_answer(audio_summary) ``` --- @@ -111,6 +115,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): answer = text_qa(text=text, question=question) print(f"The answer is {answer}.") image = image_generator(answer) +final_answer(image) ``` --- @@ -120,10 +125,11 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): Code: ```py caption = image_captioner(image) +final_answer(caption) ``` --- -Above example were using tools that might not exist for you. You only have access to those Tools: +Above example were using tools that might not exist for you. You only have acces to those Tools: <> Remember to make sure that variables you use are all defined. @@ -250,7 +256,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): } -Above example were using notional tools that might not exist for you. You only have access to those tools: +Above example were using notional tools that might not exist for you. You only have acces to those tools: <> Here are the rules you should always follow to solve your task: @@ -357,7 +363,9 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): 4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. 
For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block. 5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters. 6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'. -7. You can use imports in your code, but only from the following list of modules: <> +7. Never create any notional variables in our code, as having these in your logs might derail you from the true variables. +8. You can use imports in your code, but only from the following list of modules: <> +9. Don't give up! You're in charge of solving the task, not providing directions to solve it. Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000. """ diff --git a/src/transformers/agents/python_interpreter.py b/src/transformers/agents/python_interpreter.py index 04f62a8acfb959..1235bb95c3ae02 100644 --- a/src/transformers/agents/python_interpreter.py +++ b/src/transformers/agents/python_interpreter.py @@ -778,7 +778,10 @@ def evaluate_ast( def evaluate_python_code( - code: str, tools: Optional[Dict[str, Callable]] = {}, state=None, authorized_imports: List[str] = LIST_SAFE_MODULES + code: str, + tools: Optional[Dict[str, Callable]] = None, + state: Optional[Dict[str, Any]] = None, + authorized_imports: List[str] = LIST_SAFE_MODULES, ): """ Evaluate a python expression using the content of the variables stored in a state and only evaluating a given set @@ -803,6 +806,8 @@ def evaluate_python_code( raise SyntaxError(f"The code generated by the agent is not valid.\n{e}") if state is None: state = {} + if tools is None: + tools = {} result = None global PRINT_OUTPUTS PRINT_OUTPUTS = "" diff --git a/src/transformers/agents/tools.py b/src/transformers/agents/tools.py index b2821aa5c5726c..b8b20d8e0e6c3c 100644 --- a/src/transformers/agents/tools.py +++ b/src/transformers/agents/tools.py @@ -650,7 +650,6 @@ def fn(*args, **kwargs): "text-to-speech": "TextToSpeechTool", "translation": "TranslationTool", "python_interpreter": "PythonInterpreterTool", - "final_answer": "FinalAnswerTool", } diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index d572b8c8c71636..1f5a164815aaed 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -858,8 +858,12 @@ def update( k_out = self.key_cache[layer_idx] v_out = self.value_cache[layer_idx] - k_out[:, :, cache_position] = key_states - v_out[:, :, cache_position] = value_states + if cache_position is None: + k_out.copy_(key_states) + v_out.copy_(value_states) + else: + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states return k_out, v_out @@ -971,6 +975,158 @@ def get_max_length(self) -> Optional[int]: # no matter how long the sentence is return None + def reset(self): + self.key_cache.zero_() + self.value_cache.zero_() + + +class EncoderDecoderCache(Cache): + """ + Base, abstract class for all encoder-decoder caches. Can be used to hold combinations of self-attention and + cross-attention caches. 
+ """ + + def __init__(self, self_attention_cache: Cache, cross_attention_cache: Cache): + self.self_attention_cache = self_attention_cache + self.cross_attention_cache = cross_attention_cache + + self.is_updated = {} + for layer_idx in range(len(cross_attention_cache.key_cache)): + self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0) + + def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: + """ + Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self): + return ( + self.self_attention_cache.key_cache[layer_idx], + self.self_attention_cache.value_cache[layer_idx], + self.cross_attention_cache.key_cache[layer_idx], + self.cross_attention_cache.key_cache[layer_idx], + ) + else: + raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") + + def __len__(self): + """ + Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds + to the number of layers in the model. + """ + return len(self.self_attention_cache) + + def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: + """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" + legacy_cache = () + if len(self.cross_attention_cache) > 0: + for self_attn, cross_attn in zip( + self.self_attention_cache.to_legacy_cache(), self.cross_attention_cache.to_legacy_cache() + ): + legacy_cache += (self_attn + cross_attn,) + else: + legacy_cache = self.self_attention_cache.to_legacy_cache() + return legacy_cache + + @classmethod + def from_legacy_cache( + cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + ) -> "EncoderDecoderCache": + """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" + cache = cls(self_attention_cache=DynamicCache(), cross_attention_cache=DynamicCache()) + if past_key_values is not None: + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx][:2] + cache.self_attention_cache.update(key_states, value_states, layer_idx) + if len(past_key_values[layer_idx]) > 2: + key_states, value_states = past_key_values[layer_idx][2:] + cache.cross_attention_cache.update(key_states, value_states, layer_idx) + cache.is_updated[layer_idx] = True + return cache + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + if len(self.self_attention_cache.key_cache) <= layer_idx: + return 0 + return (self.self_attention_cache.key_cache[layer_idx][0, 0].any(dim=-1)).sum() + + def reset(self): + if hasattr(self.self_attention_cache, "reset"): + self.self_attention_cache.reset() + if hasattr(self.cross_attention_cache, "reset"): + self.cross_attention_cache.reset() + elif not hasattr(self.self_attention_cache, "reset") and not hasattr(self.cross_attention_cache, "reset"): + raise ValueError( + "Neither self nor cross-attention cache have valid `.reset()` methods. `.reset()` should " + "only be called on compatible cache classes, such as `StaticCache` or `SlidingWindowCache`. " + f"Got {self.self_attention_cache.__str__()} for the self attention cache and " + f"{self.cross_attention_cache.__str__()} for the cross attention cache." 
+ ) + for layer_idx in self.is_updated: + self.is_updated[layer_idx] = False + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + self.self_attention_cache.reorder_cache(beam_idx) + self.cross_attention_cache.reorder_cache(beam_idx) + + def check_dynamic_cache(self, method: str): + if not ( + isinstance(self.self_attention_cache, DynamicCache) + and isinstance(self.cross_attention_cache, DynamicCache) + ): + raise ValueError( + f"`{method}` is only defined for dynamic cache, got {self.self_attention_cache.__str__()} for the self " + f"attention cache and {self.cross_attention_cache.__str__()} for the cross attention cache." + ) + + # TODO(gante, sanchit-gandhi): move following functionality into `.generate` + def crop(self, maximum_length: int): + """Crop the past key values up to a new `maximum_length` in terms of tokens. `maximum_length` can also be + negative to remove `maximum_length` tokens. This is used in assisted decoding and contrastive search.""" + self.check_dynamic_cache(self.crop.__name__) + self.self_attention_cache.crop(maximum_length) + + def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]": + """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by + `_split_model_inputs()` in `generation.utils`""" + self.check_dynamic_cache(self.batch_split.__name__) + self_attention_cache = self.self_attention_cache.batch_split(full_batch_size, split_size) + cross_attention_cache = self.cross_attention_cache.batch_split(full_batch_size, split_size) + + out = [] + for self_attn, cross_attn in zip(self_attention_cache, cross_attention_cache): + out.append(EncoderDecoderCache(self_attn, cross_attn)) + return out + + @classmethod + def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache": + """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in + `generation.utils`""" + self_attention_cache = DynamicCache() + cross_attention_cache = DynamicCache() + for idx in range(len(splits[0])): + layer_keys = torch.cat([current.self_attention_cache.key_cache[idx] for current in splits], dim=0) + layer_values = torch.cat([current.self_attention_cache.value_cache[idx] for current in splits], dim=0) + self_attention_cache.update(layer_keys, layer_values, idx) + + layer_keys = torch.cat([current.cross_attention_cache.key_cache[idx] for current in splits], dim=0) + layer_values = torch.cat([current.cross_attention_cache.value_cache[idx] for current in splits], dim=0) + cross_attention_cache.update(layer_keys, layer_values, idx) + return cls(self_attention_cache, cross_attention_cache) + + def batch_repeat_interleave(self, repeats: int): + """Repeat the cache `repeats` times in the batch dimension. Used in contrastive search.""" + self.check_dynamic_cache(self.batch_repeat_interleave.__name__) + self.self_attention_cache.batch_repeat_interleave(repeats) + self.cross_attention_cache.batch_repeat_interleave(repeats) + + def batch_select_indices(self, indices: torch.Tensor): + """Only keep the `indices` in the batch dimension of the cache. 
Used in contrastive search.""" + self.check_dynamic_cache(self.batch_select_indices.__name__) + self.self_attention_cache.batch_select_indices(indices) + self.cross_attention_cache.batch_select_indices(indices) + class HybridCache(Cache): def __init__(self, config: PretrainedConfig, max_batch_size, max_cache_len, device="cpu", dtype=None) -> None: diff --git a/src/transformers/commands/add_new_model_like.py b/src/transformers/commands/add_new_model_like.py index 626e8373192a6c..e4b2f9be5cf3dc 100644 --- a/src/transformers/commands/add_new_model_like.py +++ b/src/transformers/commands/add_new_model_like.py @@ -1628,7 +1628,7 @@ def get_user_input(): ) old_processing_classes = [ - c + c if not isinstance(c, tuple) else c[0] for c in [old_image_processor_class, old_feature_extractor_class, old_tokenizer_class, old_processor_class] if c is not None ] diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 8ba17a6a350f78..dcdccad23a54c1 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -60,6 +60,7 @@ class GenerationMode(ExplicitEnum): GREEDY_SEARCH = "greedy_search" SAMPLE = "sample" ASSISTED_GENERATION = "assisted_generation" + DOLA_GENERATION = "dola_generation" # Beam methods BEAM_SEARCH = "beam_search" BEAM_SAMPLE = "beam_sample" @@ -81,6 +82,7 @@ class GenerationConfig(PushToHubMixin): - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1` - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None` - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()` + - *dola decoding* if `dola_layers` is passed to `.generate()` To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies). @@ -305,6 +307,18 @@ class GenerationConfig(PushToHubMixin): max_matching_ngram_size (`int`, *optional*, default to `None`): The maximum ngram size to be considered for matching in the prompt. Default to 2 if not provided. + > Generation parameters exclusive to [DoLa decoding](https://arxiv.org/abs/2309.03883) + + dola_layers (`str` or `List[int]`, *optional*): + The layers to use for DoLa decoding. If `None`, DoLa decoding is not used. If a string, it must + be one of "low" or "high", which means using the lower part or higher part of the model layers, respectively. + "low" means the first half of the layers up to the first 20 layers, and "high" means the last half of the + layers up to the last 20 layers. + If a list of integers, it must contain the indices of the layers to use for candidate premature layers in DoLa. + The 0-th layer is the word embedding layer of the model. Set to `'low'` to improve long-answer reasoning tasks, + `'high'` to improve short-answer tasks. Check the [documentation](https://github.com/huggingface/transformers/blob/main/docs/source/en/generation_strategies.md) + or [the paper](https://arxiv.org/abs/2309.03883) for more details. 
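To make the new `dola_layers` option concrete, here is a minimal usage sketch; the checkpoint is only an example, and, as the validation added later in this diff warns, `repetition_penalty>=1.2` is recommended when DoLa is enabled:

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "huggyllama/llama-7b"  # any decoder-only model with output embeddings
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("What is the capital of France?", return_tensors="pt")

# dola_layers="high" contrasts the final layer with the upper half of the layers
# (suggested for short-answer tasks); "low" or an explicit list of layer indices also works.
outputs = model.generate(
    **inputs,
    dola_layers="high",
    repetition_penalty=1.2,
    max_new_tokens=20,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```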
+ > Parameters specific to the caching mechanism: cache_implementation (`str`, *optional*, default to `None`): @@ -397,6 +411,9 @@ def __init__(self, **kwargs): self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 5) self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "heuristic") + # DoLa generation + self.dola_layers = kwargs.pop("dola_layers", None) + # Cache implementation self.cache_implementation = kwargs.pop("cache_implementation", None) self.cache_config = kwargs.pop("cache_config", None) @@ -495,6 +512,16 @@ def get_generation_mode(self, assistant_model: Optional["PreTrainedModel"] = Non "You've set `assistant_model`, which triggers assisted generate. Currently, assisted generate " "is only supported with Greedy Search and Sample." ) + + # DoLa generation may extend some generation modes + if self.dola_layers is not None: + if generation_mode in ("greedy_search", "sample"): + generation_mode = GenerationMode.DOLA_GENERATION + else: + raise ValueError( + "You've set `dola_layers`, which triggers DoLa generate. Currently, DoLa generate " + "is only supported with Greedy Search and Sample." + ) return generation_mode def validate(self, is_init=False): @@ -700,6 +727,17 @@ def validate(self, is_init=False): "`generate()` (or a pipeline) directly." ) + # 6. if dola_layers is set, check if repetition_penalty is set to >= 1.2 + if self.dola_layers is not None and (self.repetition_penalty is None or self.repetition_penalty < 1.2): + dola_decoding_wrong_parameter_msg = ( + "`dola_layers` is set to trigger DoLa decoding, but `repetition_penalty` is set to a value of {repetition_penalty}, " + "which could induce unwanted repetition. The recommended value for DoLa decoding is `repetition_penalty>=1.2`." + ) + warnings.warn( + dola_decoding_wrong_parameter_msg.format(repetition_penalty=self.repetition_penalty), + UserWarning, + ) + def save_pretrained( self, save_directory: Union[str, os.PathLike], diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 9c69bb35d264fe..6b4a055fba8de3 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -20,13 +20,16 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +import numpy as np import torch import torch.distributed as dist from torch import nn +from torch.nn import functional as F from ..cache_utils import ( Cache, DynamicCache, + EncoderDecoderCache, HQQQuantizedCache, HybridCache, QuantizedCacheConfig, @@ -1409,7 +1412,7 @@ def _get_initial_cache_position(self, input_ids, model_kwargs): model_kwargs["cache_position"] = torch.arange(past_length, cur_len, device=input_ids.device) return model_kwargs - def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_len: int) -> Cache: + def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_len: int, model_kwargs) -> Cache: """ Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized a new `generate` call requires a larger cache. @@ -1417,28 +1420,46 @@ def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_l Returns the resulting cache object. 
""" cache_cls: Cache = NEED_SETUP_CACHE_CLASSES_MAPPING[cache_implementation] + requires_cross_attention_cache = ( + self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None + ) + + if hasattr(self, "_cache"): + cache_to_check = self._cache.self_attention_cache if requires_cross_attention_cache else self._cache + if cache_implementation == "sliding_window": max_cache_len = min(self.config.sliding_window, max_cache_len) need_new_cache = ( not hasattr(self, "_cache") - or (not isinstance(self._cache, cache_cls)) - or self._cache.max_batch_size != max_batch_size - or self._cache.max_cache_len < max_cache_len + or (not isinstance(cache_to_check, cache_cls)) + or cache_to_check.max_batch_size != max_batch_size + or cache_to_check.max_cache_len < max_cache_len ) + if requires_cross_attention_cache and hasattr(self, "_cache"): + need_new_cache = ( + need_new_cache + or self._cache.cross_attention_cache.max_cache_len != model_kwargs["encoder_outputs"][0].shape[1] + ) + if need_new_cache: if hasattr(self.config, "_pre_quantization_dtype"): cache_dtype = self.config._pre_quantization_dtype else: cache_dtype = self.dtype - self._cache = cache_cls( - config=self.config, - max_batch_size=max_batch_size, - max_cache_len=max_cache_len, - device=self.device, - dtype=cache_dtype, - ) + cache_kwargs = { + "config": self.config, + "max_batch_size": max_batch_size, + "max_cache_len": max_cache_len, + "device": self.device, + "dtype": cache_dtype, + } + self._cache = cache_cls(**cache_kwargs) + if requires_cross_attention_cache: + encoder_kwargs = cache_kwargs.copy() + encoder_kwargs["max_cache_len"] = model_kwargs["encoder_outputs"][0].shape[1] + self._cache = EncoderDecoderCache(self._cache, cache_cls(**encoder_kwargs)) else: self._cache.reset() return self._cache @@ -1474,8 +1495,11 @@ def _tensor_or_none(token_kwargs, token_self, device=None): device = self.device token = token_kwargs if token_kwargs is not None else token_self - if token is None or isinstance(token, torch.Tensor): + if token is None: return token + elif isinstance(token, torch.Tensor): + return token.to(device) + return torch.tensor(token, device=device, dtype=torch.long) bos_token_id = _tensor_or_none( @@ -1745,6 +1769,7 @@ def generate( generation_config.cache_implementation, getattr(generation_config, "num_beams", 1) * batch_size, generation_config.max_length, + model_kwargs, ) elif generation_config.cache_implementation == "quantized": if not self._supports_quantized_cache: @@ -1776,11 +1801,22 @@ def generate( # keeps copying the cache thus using much more memory elif generation_config.cache_implementation is None and self._supports_default_dynamic_cache(): past = model_kwargs.get("past_key_values", None) + requires_cross_attention_cache = ( + self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None + ) if past is None: - model_kwargs["past_key_values"] = DynamicCache() + model_kwargs["past_key_values"] = ( + DynamicCache() + if not requires_cross_attention_cache + else EncoderDecoderCache(DynamicCache(), DynamicCache()) + ) use_dynamic_cache_by_default = True elif isinstance(past, tuple): - model_kwargs["past_key_values"] = DynamicCache.from_legacy_cache(past) + model_kwargs["past_key_values"] = ( + DynamicCache.from_legacy_cache(past) + if not requires_cross_attention_cache + else EncoderDecoderCache.from_legacy_cache(past) + ) use_dynamic_cache_by_default = True self._validate_generated_length(generation_config, input_ids_length, has_default_max_length) @@ -1874,6 +1910,28 @@ def 
generate( streamer=streamer, **model_kwargs, ) + elif generation_mode == GenerationMode.DOLA_GENERATION: + if self._is_stateful: + # DoLa decoding was not designed for stateful models, and would require some changes + raise ValueError( + f"dola decoding is not supported with stateful models, such as {self.__class__.__name__}" + ) + prepared_logits_warper = ( + self._get_logits_warper(generation_config, device=input_ids.device) + if generation_config.do_sample + else None + ) + result = self._dola_decoding( + input_ids, + dola_layers=generation_config.dola_layers, + logits_processor=prepared_logits_processor, + logits_warper=prepared_logits_warper, + stopping_criteria=prepared_stopping_criteria, + generation_config=generation_config, + synced_gpus=synced_gpus, + streamer=streamer, + **model_kwargs, + ) elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH: if not model_kwargs["use_cache"]: @@ -2064,7 +2122,7 @@ def typeerror(): # Convert to legacy cache if needed if use_dynamic_cache_by_default and generation_config.return_legacy_cache: if isinstance(result, ModelOutput) and hasattr(result, "past_key_values"): - if isinstance(result.past_key_values, DynamicCache): + if isinstance(result.past_key_values, (DynamicCache, EncoderDecoderCache)): result.past_key_values = result.past_key_values.to_legacy_cache() return result @@ -2155,6 +2213,225 @@ def contrastive_search(self, *args, **kwargs): ) return self._contrastive_search(*args, **kwargs) + def _dola_decoding( + self, + input_ids: torch.LongTensor, + dola_layers: Union[str, List[int]], + logits_processor: LogitsProcessorList, + stopping_criteria: StoppingCriteriaList, + generation_config: GenerationConfig, + synced_gpus: bool, + streamer: Optional["BaseStreamer"] = None, + logits_warper: Optional[LogitsProcessorList] = None, + **model_kwargs, + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **dola decoding** and can be + used for decoder-only text models. + The method is based on the paper "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language + Models" (https://arxiv.org/abs/2309.03883) in ICLR 2024. + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + dola_layers (`Union[str, List[int]]`): + The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which + means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices + to be used for candidate layers. The 0-th layer is the word embedding layer of the model. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + generation_config ([`~generation.GenerationConfig`]): + The generation configuration to be used as parametrization of the decoding method. + synced_gpus (`bool`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. 
Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + logits_warper (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used + to warp the prediction score distribution of the language modeling head applied before multinomial + sampling at each generation step. + model_kwargs: + Additional model specific keyword arguments will be forwarded to the `forward` function of the model. + If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] + or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + """ + + if self.config.is_encoder_decoder: + raise ValueError("DoLa decoding is only available for decoder-only models.") + # init values + + pad_token_id = generation_config.pad_token_id + output_attentions = generation_config.output_attentions + output_hidden_states = generation_config.output_hidden_states + output_scores = generation_config.output_scores + output_logits = generation_config.output_logits + return_dict_in_generate = generation_config.return_dict_in_generate + has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) + do_sample = generation_config.do_sample + if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): + raise ValueError( + "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " + f"{logits_warper})." + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # keep track of which sequences are already finished + batch_size = input_ids.shape[0] + unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) + model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) + + this_peer_finished = False + + # prepare layers for DoLa decoding + final_layer = self.config.num_hidden_layers + # if the model has tied word embeddings, we skip the word embeddings (0-th) layer and start from the 2nd layer, + # as the early exit from word embeddings will become identity function + # if the model is really shallow (<=2 layers), we use the 1st layer if it's not the final layer and the 0-th + # layer otherwise. Notice that DoLa does not help shallow models much. + if not self.config.tie_word_embeddings: + start_layer = 0 + elif final_layer > 2: + start_layer = 2 + elif final_layer == 2: + start_layer = 1 + else: + start_layer = 0 + + # For `N`-layer models with `N <= 40` layers, the layers of `range(0, N // 2, 2)` and `range(N // 2, N, 2)` + # are used for `'low'` and `'high'` layers, respectively. 
+ # For models with `N > 40` layers, the layers of `range(0, 20, 2)` and `range(N - 20, N, 2)` are used for + # `'low'` and `'high'` layers, respectively. + if isinstance(dola_layers, str) and dola_layers == "low": + if start_layer == final_layer // 2: + candidate_premature_layers = [start_layer] + else: + candidate_premature_layers = ( + list(range(start_layer, final_layer // 2, 2)) + if final_layer <= 40 + else list(range(start_layer, 20, 2)) + ) + elif isinstance(dola_layers, str) and dola_layers == "high": + candidate_premature_layers = ( + list(range(final_layer // 2, final_layer, 2)) + if final_layer <= 40 + else list(range(final_layer - 20, final_layer, 2)) + ) + # Set the `dola_layers` to a list of integers for layer indices to contrast manually specified layers. + elif isinstance(dola_layers, list): + candidate_premature_layers = [i for i in dola_layers if i < final_layer] + else: + raise ValueError("dola_layers must be either 'low', 'high' or a list of integers.") + + lm_head = self.get_output_embeddings() + if lm_head is None: + raise ValueError("DoLa is not supported for models that don't have output embeddings.") + + while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=True, + ) + + final_layer_next_token_logits = outputs.logits[:, -1, :].detach().clone() + final_logits = outputs.logits[:, -1, :] + candidate_premature_logits = {} + for candidate_premature_layer in candidate_premature_layers: + candidate_premature_logits[candidate_premature_layer] = lm_head( + outputs.hidden_states[candidate_premature_layer][:, -1, :] + ) + + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + + next_token_logits = _dola_select_contrast( + candidate_premature_layers, candidate_premature_logits, final_logits + ) + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + if do_sample: # sample + next_token_scores = logits_warper(input_ids, next_token_scores) + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_logits: + raw_logits += (final_layer_next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + if do_sample: # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: # argmax + next_tokens = torch.argmax(next_token_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if has_eos_stopping_criteria: + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = 
self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) + + # stop when each sentence is finished + unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) + this_peer_finished = unfinished_sequences.max() == 0 + + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + return GenerateDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return input_ids + @torch.no_grad() def _contrastive_search( self, @@ -2234,7 +2511,7 @@ def _contrastive_search( # if the first step in the loop, encode all the prefix and obtain: (1) past_key_values; # (2) last_hidden_states; (3) logit_for_next_step; (4) update model kwargs for the next step if model_kwargs.get("past_key_values") is None or ( - isinstance(model_kwargs["past_key_values"], Cache) + isinstance(model_kwargs["past_key_values"], (Cache, EncoderDecoderCache)) and model_kwargs["past_key_values"].get_seq_length() == 0 ): # prepare inputs @@ -2323,7 +2600,9 @@ def _contrastive_search( # Replicates the new past_key_values to match the `top_k` candidates past = model_kwargs["past_key_values"] # If it is a static cache, modify it in-place layer after layer to save memory - if isinstance(past, DynamicCache): + if isinstance(past, DynamicCache) or ( + isinstance(past, EncoderDecoderCache) and isinstance(past.self_attention_cache, DynamicCache) + ): past.batch_repeat_interleave(top_k) else: new_key_values = [] @@ -2350,7 +2629,10 @@ def _contrastive_search( output_hidden_states=True, output_attentions=output_attentions, ) - if isinstance(outputs["past_key_values"], DynamicCache): + if isinstance(outputs["past_key_values"], DynamicCache) or ( + isinstance(outputs["past_key_values"], EncoderDecoderCache) + and isinstance(outputs["past_key_values"].self_attention_cache, DynamicCache) + ): # Remove past K-V from output since we don't need to stack later outputs["past_key_values"] = None # Remove last token from past K-V since we don't want to append it at this point @@ -2425,7 +2707,10 @@ def _contrastive_search( else: _, next_past_key_values = self._extract_past_from_model_output(outputs, standardize_cache_format=True) # Do it in-place layer per layer to save memory - if isinstance(next_past_key_values, DynamicCache): + if isinstance(next_past_key_values, DynamicCache) or ( + isinstance(next_past_key_values, EncoderDecoderCache) + and isinstance(next_past_key_values.self_attention_cache, DynamicCache) + ): next_past_key_values.batch_select_indices(augmented_idx) else: new_key_values = [] @@ -2498,7 +2783,10 @@ def _contrastive_search( # Contrastive search works by forward looking at the next token, so we need to exclude it from # `past_key_values` to be consistent with the other decoding methods if model_kwargs.get("past_key_values") is not None: - if isinstance(model_kwargs["past_key_values"], DynamicCache): + if isinstance(model_kwargs["past_key_values"], DynamicCache) or ( + isinstance(model_kwargs["past_key_values"], EncoderDecoderCache) + and isinstance(model_kwargs["past_key_values"].self_attention_cache, DynamicCache) + ): model_kwargs["past_key_values"].crop(-1) else: past_key_values = [] @@ -2647,13 +2935,12 @@ def _sample( # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # prepare variable output controls (note: 
some models won't accept all output controls) + model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) + model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) + # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) + outputs = self(**model_inputs, return_dict=True) if synced_gpus and this_peer_finished: continue # don't waste resources running the code we don't need @@ -2757,7 +3044,7 @@ def _temporary_reorder_cache(self, past_key_values, beam_idx): # Exception 2: models with different cache formats. These are limited to `DynamicCache` until their # cache format is standardized, to avoid adding complexity to the codebase. elif "bloom" in model_class or "gptbigcode" in model_class: - if not isinstance(past_key_values, DynamicCache): + if not isinstance(past_key_values, (DynamicCache, EncoderDecoderCache)): raise ValueError( f"Using an unsupported cache format with {model_class}. Currently, it only supports the " "legacy tuple format or `DynamicCache`" @@ -2874,6 +3161,10 @@ def _beam_search( while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # prepare variable output controls (note: some models won't accept all output controls) + model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) + model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) + # if sequential is True, split the input to batches of batch_size and run sequentially if sequential: if any( @@ -2899,24 +3190,13 @@ def _beam_search( model_inputs, split_size=batch_size, full_batch_size=batch_beam_size ) outputs_per_sub_batch = [ - self( - **inputs_per_sub_batch, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - for inputs_per_sub_batch in inputs_per_sub_batches + self(**inputs_per_sub_batch, return_dict=True) for inputs_per_sub_batch in inputs_per_sub_batches ] outputs = stack_model_outputs(outputs_per_sub_batch) else: # Unchanged original behavior - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) + outputs = self(**model_inputs, return_dict=True) if synced_gpus and this_peer_finished: cur_len = cur_len + 1 @@ -3196,12 +3476,12 @@ def _group_beam_search( # do one decoder step on all beams of all sentences in batch model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) + + # prepare variable output controls (note: some models won't accept all output controls) + model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) + model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) + + outputs = self(**model_inputs, return_dict=True) if synced_gpus and this_peer_finished: cur_len = cur_len + 1 @@ -3477,12 +3757,11 @@ def _constrained_beam_search( while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - outputs = self( - 
**model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) + # prepare variable output controls (note: some models won't accept all output controls) + model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) + model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) + + outputs = self(**model_inputs, return_dict=True) if synced_gpus and this_peer_finished: cur_len = cur_len + 1 @@ -3703,8 +3982,12 @@ def _assisted_decoding( # This is needed if return_dict_in_generate is True start_from_empty_dynamic_cache = False - if isinstance(model_kwargs.get("past_key_values", None), DynamicCache): - if len(model_kwargs["past_key_values"]) == 0: + past_key_values = model_kwargs.get("past_key_values", None) + if isinstance(past_key_values, DynamicCache) or ( + isinstance(past_key_values, EncoderDecoderCache) + and isinstance(past_key_values.self_attention_cache, DynamicCache) + ): + if len(past_key_values) == 0: start_from_empty_dynamic_cache = True this_peer_finished = False @@ -3744,11 +4027,11 @@ def _assisted_decoding( model_inputs["num_logits_to_keep"] = candidate_length + 1 # 2.2. Run a forward pass on the candidate sequence - outputs = self( - **model_inputs, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) + # prepare variable output controls (note: some models won't accept all output controls) + model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) + model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) + + outputs = self(**model_inputs) # 2.3. Process the new logits new_logits = outputs.logits[:, -candidate_length - 1 :] # excludes the input prompt if present @@ -4022,7 +4305,9 @@ def _split(data, full_batch_size: int, split_size: int = None): if isinstance(data, torch.Tensor): return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)] # New cache format - elif isinstance(data, DynamicCache): + elif isinstance(data, DynamicCache) or ( + isinstance(data, EncoderDecoderCache) and isinstance(data.self_attention_cache, DynamicCache) + ): return data.batch_split(full_batch_size, split_size) elif isinstance(data, tuple): # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example) @@ -4130,6 +4415,8 @@ def _concat(data): # New cache format elif isinstance(data[0], DynamicCache): return DynamicCache.from_batch_splits(data) + elif isinstance(data[0], EncoderDecoderCache): + return EncoderDecoderCache.from_batch_splits(data) elif isinstance(data[0], tuple): # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example) if isinstance(data[0][0], tuple): @@ -4153,3 +4440,75 @@ def _concat(data): # Return a new object of the inferred class with the concatenated attributes return model_output_cls(**concatenated_data) + + +def _relative_top_filter( + scores: torch.FloatTensor, + baseline_scores: torch.FloatTensor, + relative_top: float = 0.1, + filter_value: float = -float("Inf"), + base_filter_value=-1e-3, + min_tokens_to_keep: int = 1, +) -> torch.FloatTensor: + """ + Reference: https://github.com/XiangLi1999/ContrastiveDecoding/blob/170e9142e92159c1237d731e240f5eb14aabf428/transformers/src/transformers/generation_logits_process.py#L235 + Apply filtering to only keep tokens with a probability above a certain threshold. 
The threshold is defined as `relative_top` * max probability in the distribution. + """ + scores_normalized = scores.log_softmax(dim=-1) + baseline_scores_normalized = baseline_scores.log_softmax(dim=-1) + sorted_logits, sorted_indices = torch.sort(scores_normalized, descending=True) + min_thresh = sorted_logits[..., min_tokens_to_keep - 1] + probs_max = torch.max(scores_normalized, dim=-1).values + probs_thresh = probs_max + np.log(relative_top) + probs_thresh = torch.min(min_thresh, probs_thresh) + probs_thresh = probs_thresh.unsqueeze(-1) + baseline_scores_normalized[scores_normalized < probs_thresh] = base_filter_value + scores_normalized[scores_normalized < probs_thresh] = filter_value + return scores_normalized, baseline_scores_normalized + + +def _dola_select_contrast( + candidate_premature_layers: List[int], + candidate_premature_logits: Dict[int, torch.FloatTensor], + final_logits: torch.FloatTensor, +) -> torch.FloatTensor: + if len(candidate_premature_layers) == 1: + base_logits = candidate_premature_logits[candidate_premature_layers[0]] + final_logits, base_logits = _relative_top_filter(final_logits, base_logits) + logits = final_logits - base_logits + return logits + + # 1. Stacking all premature_layers into a new dimension + stacked_premature_layers = torch.stack([candidate_premature_logits[i] for i in candidate_premature_layers], dim=0) + + # 2. Calculate the softmax values for mature_layer and all premature_layers + # shape: (batch_size, vocab_size) + softmax_mature_layer = F.softmax(final_logits, dim=-1) + # shape: (num_premature_layers, batch_size, vocab_size) + softmax_premature_layers = F.softmax(stacked_premature_layers, dim=-1) + + # 3. Calculate the average distribution + # shape: (num_premature_layers, batch_size, vocab_size) + avg_dist = 0.5 * (softmax_mature_layer[None, :, :] + softmax_premature_layers) + + # 4. Calculate log-softmax for the KL divergence + # shape: (batch_size, vocab_size) + log_softmax_mature_layer = F.log_softmax(final_logits, dim=-1) + # shape: (num_premature_layers, batch_size, vocab_size) + log_softmax_premature_layers = F.log_softmax(stacked_premature_layers, dim=-1) + + # 5. Calculate the KL divergences and then the JS divergences + # shape: (num_premature_layers, batch_size) + kl1 = F.kl_div(log_softmax_mature_layer[None, :, :], avg_dist, reduction="none").mean(-1) + # shape: (num_premature_layers, batch_size) + kl2 = F.kl_div(log_softmax_premature_layers, avg_dist, reduction="none").mean(-1) + js_divs = 0.5 * (kl1 + kl2) # shape: (num_premature_layers, batch_size) + + # 6. 
Reduce the batchmean + js_divs = js_divs.mean(-1) # shape: (num_premature_layers,) + premature_layer = candidate_premature_layers[int(js_divs.argmax().cpu().item())] + + base_logits = candidate_premature_logits[premature_layer] + final_logits, base_logits = _relative_top_filter(final_logits, base_logits) + logits = final_logits - base_logits + return logits diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index a5d8f6f872aabd..0a1a73f5c63e11 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -409,22 +409,22 @@ def validate_preprocess_arguments( """ if do_rescale and rescale_factor is None: - raise ValueError("rescale_factor must be specified if do_rescale is True.") + raise ValueError("`rescale_factor` must be specified if `do_rescale` is `True`.") if do_pad and size_divisibility is None: # Here, size_divisor might be passed as the value of size raise ValueError( - "Depending on moel, size_divisibility, size_divisor, pad_size or size must be specified if do_pad is True." + "Depending on the model, `size_divisibility`, `size_divisor`, `pad_size` or `size` must be specified if `do_pad` is `True`." ) if do_normalize and (image_mean is None or image_std is None): - raise ValueError("image_mean and image_std must both be specified if do_normalize is True.") + raise ValueError("`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.") if do_center_crop and crop_size is None: - raise ValueError("crop_size must be specified if do_center_crop is True.") + raise ValueError("`crop_size` must be specified if `do_center_crop` is `True`.") if do_resize and (size is None or resample is None): - raise ValueError("size and resample must be specified if do_resize is True.") + raise ValueError("`size` and `resample` must be specified if `do_resize` is `True`.") # In the future we can add a TF implementation here when we have TF models. 
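
For readers reviewing the DoLa helpers added above (`_relative_top_filter` and `_dola_select_contrast`), here is a minimal, self-contained sketch of what they compute: the premature layer whose token distribution has the largest Jensen-Shannon divergence from the final layer is selected, low-probability tokens are masked with a relative-top threshold, and the final and premature log-probabilities are contrasted. This is an illustration only, not the library code: the layer indices, logits and shapes below are made up, and the `min_tokens_to_keep` handling of the real helper is omitted.

```python
import torch
import torch.nn.functional as F


def js_divergence(p_logits: torch.Tensor, q_logits: torch.Tensor) -> float:
    """Jensen-Shannon divergence between two batches of logits, averaged over the batch."""
    p = F.softmax(p_logits, dim=-1)
    q = F.softmax(q_logits, dim=-1)
    m = 0.5 * (p + q)
    # F.kl_div expects log-probabilities as input and probabilities as target
    kl_pm = F.kl_div(m.log(), p, reduction="none").sum(-1)  # KL(p || m) per example
    kl_qm = F.kl_div(m.log(), q, reduction="none").sum(-1)  # KL(q || m) per example
    return (0.5 * (kl_pm + kl_qm)).mean().item()


batch_size, vocab_size = 2, 10
final_logits = torch.randn(batch_size, vocab_size)  # logits of the mature (final) layer
candidate_layers = [2, 4, 6]                         # hypothetical premature layer indices
premature_logits = {i: torch.randn(batch_size, vocab_size) for i in candidate_layers}

# 1. Pick the premature layer whose distribution diverges most from the final layer.
divergences = {i: js_divergence(final_logits, premature_logits[i]) for i in candidate_layers}
chosen = max(divergences, key=divergences.get)

# 2. Relative-top filtering: keep only tokens whose final-layer log-probability is within
#    log(relative_top) of the best token, then contrast final and premature log-probabilities.
relative_top = 0.1
final_logprobs = final_logits.log_softmax(dim=-1)
base_logprobs = premature_logits[chosen].log_softmax(dim=-1)
threshold = final_logprobs.max(dim=-1, keepdim=True).values + torch.log(torch.tensor(relative_top))
low_prob = final_logprobs < threshold
contrasted = (final_logprobs - base_logprobs).masked_fill(low_prob, -float("inf"))
next_tokens = contrasted.argmax(dim=-1)
print(f"selected premature layer: {chosen}, next tokens: {next_tokens.tolist()}")
```

The helpers in the diff do the same thing in batched form: they stack all candidate premature layers into one tensor, average the per-layer Jensen-Shannon divergence over the batch, and take the argmax over layers before applying the relative-top filter and subtracting the two sets of scores.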
diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 29528feb515cbd..e9c91192ecf9a2 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -51,19 +51,25 @@ import torch # comet_ml requires to be imported before any ML frameworks -_has_comet = importlib.util.find_spec("comet_ml") is not None and os.getenv("COMET_MODE", "").upper() != "DISABLED" -if _has_comet: - try: - import comet_ml # noqa: F401 +_MIN_COMET_VERSION = "3.43.2" +try: + _comet_version = importlib.metadata.version("comet_ml") + _is_comet_installed = True - if hasattr(comet_ml, "config") and comet_ml.config.get_config("comet.api_key"): - _has_comet = True - else: - if os.getenv("COMET_MODE", "").upper() != "DISABLED": - logger.warning("comet_ml is installed but `COMET_API_KEY` is not set.") - _has_comet = False - except (ImportError, ValueError): - _has_comet = False + _is_comet_recent_enough = packaging.version.parse(_comet_version) >= packaging.version.parse(_MIN_COMET_VERSION) + + # Check if the Comet API Key is set + import comet_ml + + if comet_ml.config.get_config("comet.api_key") is not None: + _is_comet_configured = True + else: + _is_comet_configured = False +except (importlib.metadata.PackageNotFoundError, ImportError, ValueError, TypeError, AttributeError, KeyError): + _comet_version = None + _is_comet_installed = False + _is_comet_recent_enough = False + _is_comet_configured = False _has_neptune = ( importlib.util.find_spec("neptune") is not None or importlib.util.find_spec("neptune-client") is not None @@ -103,7 +109,36 @@ def is_clearml_available(): def is_comet_available(): - return _has_comet + if os.getenv("COMET_MODE", "").upper() == "DISABLED": + logger.warning( + "Using the `COMET_MODE=DISABLED` environment variable is deprecated and will be removed in v5. Use the " + "--report_to flag to control the integrations used for logging result (for instance --report_to none)." + ) + return False + + if _is_comet_installed is False: + return False + + if _is_comet_recent_enough is False: + logger.warning( + "comet_ml version %s is installed, but version %s or higher is required. " + "Please update comet_ml to the latest version to enable Comet logging with pip install 'comet-ml>=%s'.", + _comet_version, + _MIN_COMET_VERSION, + _MIN_COMET_VERSION, + ) + return False + + if _is_comet_configured is False: + logger.warning( + "comet_ml is installed but the Comet API Key is not configured. " + "Please set the `COMET_API_KEY` environment variable to enable Comet logging. " + "Check out the documentation for other ways of configuring it: " + "https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key" + ) + return False + + return True def is_tensorboard_available(): @@ -936,56 +971,109 @@ def on_predict(self, args, state, control, metrics, **kwargs): class CometCallback(TrainerCallback): """ - A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.ml/site/). + A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/). """ def __init__(self): - if not _has_comet: - raise RuntimeError("CometCallback requires comet-ml to be installed. Run `pip install comet-ml`.") + if _is_comet_installed is False or _is_comet_recent_enough is False: + raise RuntimeError( + f"CometCallback requires comet-ml>={_MIN_COMET_VERSION} to be installed. Run `pip install comet-ml>={_MIN_COMET_VERSION}`." 
+ ) self._initialized = False self._log_assets = False + self._experiment = None def setup(self, args, state, model): """ - Setup the optional Comet.ml integration. + Setup the optional Comet integration. Environment: - - **COMET_MODE** (`str`, *optional*, defaults to `ONLINE`): - Whether to create an online, offline experiment or disable Comet logging. Can be `OFFLINE`, `ONLINE`, or - `DISABLED`. + - **COMET_MODE** (`str`, *optional*, default to `get_or_create`): + Control whether to create and log to a new Comet experiment or append to an existing experiment. + It accepts the following values: + * `get_or_create`: Decides automatically depending if + `COMET_EXPERIMENT_KEY` is set and whether an Experiment + with that key already exists or not. + * `create`: Always create a new Comet Experiment. + * `get`: Always try to append to an Existing Comet Experiment. + Requires `COMET_EXPERIMENT_KEY` to be set. + * `ONLINE`: **deprecated**, used to create an online + Experiment. Use `COMET_START_ONLINE=1` instead. + * `OFFLINE`: **deprecated**, used to created an offline + Experiment. Use `COMET_START_ONLINE=0` instead. + * `DISABLED`: **deprecated**, used to disable Comet logging. + Use the `--report_to` flag to control the integrations used + for logging result instead. - **COMET_PROJECT_NAME** (`str`, *optional*): Comet project name for experiments. - - **COMET_OFFLINE_DIRECTORY** (`str`, *optional*): - Folder to use for saving offline experiments when `COMET_MODE` is `OFFLINE`. - **COMET_LOG_ASSETS** (`str`, *optional*, defaults to `TRUE`): Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be `TRUE`, or `FALSE`. For a number of configurable items in the environment, see - [here](https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables). + [here](https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options). 
""" self._initialized = True log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper() if log_assets in {"TRUE", "1"}: self._log_assets = True if state.is_world_process_zero: - comet_mode = os.getenv("COMET_MODE", "ONLINE").upper() - experiment = None - experiment_kwargs = {"project_name": os.getenv("COMET_PROJECT_NAME", "huggingface")} - if comet_mode == "ONLINE": - experiment = comet_ml.Experiment(**experiment_kwargs) - experiment.log_other("Created from", "transformers") - logger.info("Automatic Comet.ml online logging enabled") - elif comet_mode == "OFFLINE": - experiment_kwargs["offline_directory"] = os.getenv("COMET_OFFLINE_DIRECTORY", "./") - experiment = comet_ml.OfflineExperiment(**experiment_kwargs) - experiment.log_other("Created from", "transformers") - logger.info("Automatic Comet.ml offline logging enabled; use `comet upload` when finished") - if experiment is not None: - experiment._set_model_graph(model, framework="transformers") - experiment._log_parameters(args, prefix="args/", framework="transformers") - if hasattr(model, "config"): - experiment._log_parameters(model.config, prefix="config/", framework="transformers") + comet_old_mode = os.getenv("COMET_MODE") + + mode = None + online = None + + if comet_old_mode is not None: + comet_old_mode = comet_old_mode.lower() + + if comet_old_mode == "online": + online = True + elif comet_old_mode == "offline": + online = False + elif comet_old_mode in ("get", "get_or_create", "create"): + mode = comet_old_mode + elif comet_old_mode: + logger.warning("Invalid COMET_MODE env value %r, Comet logging is disabled", comet_old_mode) + return + + # For HPO, we always create a new experiment for each trial + if state.is_hyper_param_search: + if mode is not None: + logger.warning( + "Hyperparameter Search is enabled, forcing the creation of new experimetns, COMET_MODE value %r is ignored", + comet_old_mode, + ) + mode = "create" + + import comet_ml + + # Do not use the default run_name as the experiment name + if args.run_name is not None and args.run_name != args.output_dir: + experiment_config = comet_ml.ExperimentConfig(name=args.run_name) + else: + experiment_config = comet_ml.ExperimentConfig() + + self._experiment = comet_ml.start(online=online, mode=mode, experiment_config=experiment_config) + self._experiment.__internal_api__set_model_graph__(model, framework="transformers") + + params = {"args": args.to_dict()} + + if hasattr(model, "config") and model.config is not None: + model_config = model.config.to_dict() + params["config"] = model_config + if hasattr(model, "peft_config") and model.peft_config is not None: + peft_config = model.peft_config + params["peft_config"] = peft_config + + self._experiment.__internal_api__log_parameters__( + params, framework="transformers", source="manual", flatten_nested=True + ) + + if state.is_hyper_param_search: + optimization_id = getattr(state, "trial_name", None) + optimization_params = getattr(state, "trial_params", None) + + self._experiment.log_optimization(optimization_id=optimization_id, parameters=optimization_params) def on_train_begin(self, args, state, control, model=None, **kwargs): if not self._initialized: @@ -995,20 +1083,24 @@ def on_log(self, args, state, control, model=None, logs=None, **kwargs): if not self._initialized: self.setup(args, state, model) if state.is_world_process_zero: - experiment = comet_ml.config.get_global_experiment() - if experiment is not None: - experiment._log_metrics(logs, step=state.global_step, epoch=state.epoch, framework="transformers") + if 
self._experiment is not None: + self._experiment.__internal_api__log_metrics__( + logs, step=state.global_step, epoch=state.epoch, framework="transformers" + ) def on_train_end(self, args, state, control, **kwargs): if self._initialized and state.is_world_process_zero: - experiment = comet_ml.config.get_global_experiment() - if experiment is not None: + if self._experiment is not None: if self._log_assets is True: logger.info("Logging checkpoints. This may take time.") - experiment.log_asset_folder( + self._experiment.log_asset_folder( args.output_dir, recursive=True, log_file_name=True, step=state.global_step ) - experiment.end() + + # We create one experiment per trial in HPO mode + if state.is_hyper_param_search: + self._experiment.clean() + self._initialized = False class AzureMLCallback(TrainerCallback): diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index c991c1c95ba24b..b37f968ae92625 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1942,8 +1942,8 @@ def resize_token_embeddings( # Update base model and current model config if hasattr(self.config, "text_config"): self.config.text_config.vocab_size = model_embeds.weight.shape[0] - # TODO: to be removed after v4.42, config.vocab_size is deprecated for models that have a config.text_config - self.config.vocab_size = model_embeds.weight.shape[0] + else: + self.config.vocab_size = model_embeds.weight.shape[0] self.vocab_size = model_embeds.weight.shape[0] # Tie weights again if needed @@ -2518,9 +2518,11 @@ def save_pretrained( # Save the model if state_dict is None: - # if any model parameters are offloaded to the disk, make module map - if hasattr(self, "hf_device_map") and ( - "cpu" in self.hf_device_map.values() or "disk" in self.hf_device_map.values() + # if any model parameters are offloaded, make module map + if ( + hasattr(self, "hf_device_map") + and len(set(self.hf_device_map.values())) > 1 + and ("cpu" in self.hf_device_map.values() or "disk" in self.hf_device_map.values()) ): warnings.warn( "Attempting to save a model with offloaded modules. 
Ensure that unallocated cpu memory exceeds the `shard_size` (5GB default)" @@ -2532,7 +2534,6 @@ def save_pretrained( for key in module_state_dict: module_map[name + f".{key}"] = module - state_dict = model_to_save.state_dict() # Translate state_dict from smp to hf if saving with smp >= 1.10 @@ -2655,9 +2656,11 @@ def save_pretrained( and reg.fullmatch(filename_no_suffix) is not None ): os.remove(full_filename) - # Save the model - for shard_file, tensors in state_dict_split.filename_to_tensors.items(): + filename_to_tensors = state_dict_split.filename_to_tensors.items() + if module_map: + filename_to_tensors = logging.tqdm(filename_to_tensors, desc="Saving checkpoint shards") + for shard_file, tensors in filename_to_tensors: shard = {tensor: state_dict[tensor] for tensor in tensors} # remake shard with onloaded parameters if necessary if module_map: @@ -2667,15 +2670,15 @@ def save_pretrained( f"Please upgrade accelerate with `pip install -U accelerate`" ) # init state_dict for this shard - state_dict = {name: "" for name in shard} + shard_state_dict = {name: "" for name in shard} for module_name in shard: module = module_map[module_name] # update state dict with onloaded parameters - state_dict = get_state_dict_from_offload(module, module_name, state_dict) + shard_state_dict = get_state_dict_from_offload(module, module_name, shard_state_dict) # assign shard to be the completed state dict - shard = state_dict - del state_dict + shard = shard_state_dict + del shard_state_dict gc.collect() if safe_serialization: @@ -2829,7 +2832,7 @@ def from_pretrained( revision: str = "main", use_safetensors: bool = None, **kwargs, - ): + ) -> "PreTrainedModel": r""" Instantiate a pretrained pytorch model from a pre-trained model configuration. diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index f4c33491472833..043c02a8d3f5ca 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -263,4 +263,5 @@ xmod, yolos, yoso, + zoedepth, ) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 7f52b3dc280ac6..e1aa4fb7151fe3 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -291,6 +291,7 @@ ("xmod", "XmodConfig"), ("yolos", "YolosConfig"), ("yoso", "YosoConfig"), + ("zoedepth", "ZoeDepthConfig"), ] ) @@ -356,6 +357,7 @@ ("deit", "DeiT"), ("deplot", "DePlot"), ("depth_anything", "Depth Anything"), + ("depth_anything_v2", "Depth Anything V2"), ("deta", "DETA"), ("detr", "DETR"), ("dialogpt", "DialoGPT"), @@ -588,6 +590,7 @@ ("xmod", "X-MOD"), ("yolos", "YOLOS"), ("yoso", "YOSO"), + ("zoedepth", "ZoeDepth"), ] ) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index efc2d4d998ccdd..8ad9a3034b64e0 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -142,6 +142,7 @@ ("vitmatte", ("VitMatteImageProcessor",)), ("xclip", ("CLIPImageProcessor",)), ("yolos", ("YolosImageProcessor",)), + ("zoedepth", ("ZoeDepthImageProcessor",)), ] ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index f674b777fca7be..8c4cea1539d55e 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -792,6 +792,7 @@ ("depth_anything", "DepthAnythingForDepthEstimation"), 
("dpt", "DPTForDepthEstimation"), ("glpn", "GLPNForDepthEstimation"), + ("zoedepth", "ZoeDepthForDepthEstimation"), ] ) MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 7c7342bb9fb7e7..631fee8f8cc444 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -59,6 +59,7 @@ ("flava", "FlavaProcessor"), ("fuyu", "FuyuProcessor"), ("git", "GitProcessor"), + ("grounding-dino", "GroundingDinoProcessor"), ("groupvit", "CLIPProcessor"), ("hubert", "Wav2Vec2Processor"), ("idefics", "IdeficsProcessor"), diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index 184ab558228620..58b28866646091 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -34,7 +34,7 @@ SemanticSegmenterOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -193,12 +193,6 @@ def forward( interpolate_pos_encoding: bool = False, ) -> torch.Tensor: _, _, height, width = pixel_values.shape - if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]): - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model" - f" ({self.image_size[0]}*{self.image_size[1]})." - ) - embeddings, (patch_height, patch_width) = self.patch_embeddings( pixel_values, self.position_embeddings[:, 1:, :] if self.position_embeddings is not None else None ) @@ -280,6 +274,7 @@ def forward( class BeitSelfAttention(nn.Module): def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None: super().__init__() + self.config = config if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " @@ -313,6 +308,7 @@ def forward( output_attentions: bool = False, relative_position_bias: Optional["BeitRelativePositionBias"] = None, interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: mixed_query_layer = self.query(hidden_states) @@ -327,9 +323,11 @@ def forward( # Add relative position bias if present. if self.relative_position_bias is not None: + height, width = resolution + window_size = (height // self.config.patch_size, width // self.config.patch_size) attention_scores = attention_scores + self.relative_position_bias( - interpolate_pos_encoding, attention_scores.shape[2] - ).unsqueeze(0) + window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1] + ) # Add shared relative position bias if provided. 
if relative_position_bias is not None: @@ -407,9 +405,10 @@ def forward( output_attentions: bool = False, relative_position_bias: Optional["BeitRelativePositionBias"] = None, interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: self_outputs = self.attention( - hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding + hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding, resolution ) attention_output = self.output(self_outputs[0], hidden_states) @@ -475,6 +474,7 @@ def forward( output_attentions: bool = False, relative_position_bias: Optional["BeitRelativePositionBias"] = None, interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: self_attention_outputs = self.attention( self.layernorm_before(hidden_states), # in BEiT, layernorm is applied before self-attention @@ -482,6 +482,7 @@ def forward( output_attentions=output_attentions, relative_position_bias=relative_position_bias, interpolate_pos_encoding=interpolate_pos_encoding, + resolution=resolution, ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights @@ -520,32 +521,71 @@ def __init__(self, config: BeitConfig, window_size: tuple) -> None: ) # 2*Wh-1 * 2*Ww-1, nH # cls to token & token 2 cls & cls to cls + self.relative_position_indices = {} + + def generate_relative_position_index(self, window_size: Tuple[int, int]) -> torch.Tensor: + """ + This method creates the relative position index, modified to support arbitrary window sizes, + as introduced in [MiDaS v3.1](https://arxiv.org/abs/2307.14460). 
+ """ + num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + # cls to token & token 2 cls & cls to cls # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(window_size[0]) - coords_w = torch.arange(window_size[1]) - coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij")) # 2, Wh, Ww + window_area = window_size[0] * window_size[1] + grid = torch.meshgrid(torch.arange(window_size[0]), torch.arange(window_size[1]), indexing="ij") + coords = torch.stack(grid) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = torch.zeros( - size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype - ) + relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype) relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 + relative_position_index[0, 0:] = num_relative_distance - 3 + relative_position_index[0:, 0] = num_relative_distance - 2 + relative_position_index[0, 0] = num_relative_distance - 1 + return relative_position_index + + def forward(self, window_size, interpolate_pos_encoding: bool = False, dim_size=None) -> torch.Tensor: + """ + Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes. 
+ """ + old_height = 2 * self.window_size[0] - 1 + old_width = 2 * self.window_size[1] - 1 + + new_height = 2 * window_size[0] - 1 + new_width = 2 * window_size[1] - 1 - self.register_buffer("relative_position_index", relative_position_index, persistent=False) + old_relative_position_bias_table = self.relative_position_bias_table - def forward(self, interpolate_pos_encoding: bool = False, dim_size: Optional[int] = None) -> torch.Tensor: - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1 - ) # Wh*Ww,Wh*Ww,nH + old_num_relative_distance = self.num_relative_distance + new_num_relative_distance = new_height * new_width + 3 + + old_sub_table = old_relative_position_bias_table[: old_num_relative_distance - 3] + + old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2) + new_sub_table = nn.functional.interpolate( + old_sub_table, size=(int(new_height), int(new_width)), mode="bilinear" + ) + new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1) + + new_relative_position_bias_table = torch.cat( + [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]] + ) + + key = window_size + if key not in self.relative_position_indices.keys(): + self.relative_position_indices[key] = self.generate_relative_position_index(window_size) + + relative_position_bias = new_relative_position_bias_table[self.relative_position_indices[key].view(-1)] + # patch_size*num_patches_height, patch_size*num_patches_width, num_attention_heads + relative_position_bias = relative_position_bias.view( + window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1 + ) + # num_attention_heads, patch_size*num_patches_width, patch_size*num_patches_height + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww if interpolate_pos_encoding: relative_position_bias = nn.functional.interpolate( relative_position_bias.unsqueeze(1), @@ -554,7 +594,7 @@ def forward(self, interpolate_pos_encoding: bool = False, dim_size: Optional[int align_corners=False, ).squeeze(1) - return relative_position_bias + return relative_position_bias.unsqueeze(0) class BeitEncoder(nn.Module): @@ -587,6 +627,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, return_dict: bool = True, ) -> Union[tuple, BaseModelOutput]: all_hidden_states = () if output_hidden_states else None @@ -606,13 +647,22 @@ def forward( output_attentions, ) else: + height, width = resolution + window_size = (height // self.config.patch_size, width // self.config.patch_size) relative_position_bias = ( - self.relative_position_bias(interpolate_pos_encoding, hidden_states.shape[1]) + self.relative_position_bias( + window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1] + ) if self.relative_position_bias is not None else None ) layer_outputs = layer_module( - hidden_states, layer_head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding + hidden_states, + layer_head_mask, + output_attentions, + relative_position_bias, + interpolate_pos_encoding, + resolution, ) hidden_states = layer_outputs[0] @@ -643,6 +693,7 @@ class 
BeitPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["BeitLayer"] + _keys_to_ignore_on_load_unexpected = [r".*relative_position_index.*"] def _init_weights(self, module): """Initialize the weights""" @@ -738,7 +789,7 @@ class PreTrainedModel ) def forward( self, - pixel_values: Optional[torch.Tensor] = None, + pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None, head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, @@ -756,9 +807,6 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -766,15 +814,17 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output, (patch_height, patch_width) = self.embeddings( + embedding_output, _ = self.embeddings( pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding ) + resolution = pixel_values.shape[2:] encoder_outputs = self.encoder( embedding_output, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + resolution=resolution, return_dict=return_dict, interpolate_pos_encoding=interpolate_pos_encoding, ) @@ -1477,9 +1527,14 @@ def forward( batch_size = pixel_values.shape[0] embedding_output, (patch_height, patch_width) = self.embeddings(pixel_values) + resolution = pixel_values.shape[2:] outputs = self.encoder( - embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + embedding_output, + output_hidden_states=True, + output_attentions=output_attentions, + resolution=resolution, + return_dict=return_dict, ) hidden_states = outputs.hidden_states if return_dict else outputs[1] diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 87799899945fa6..4d1739ecf26172 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -89,7 +89,7 @@ def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): ) if text is not None and audios is not None: - encoding["input_features"] = audio_features.input_features + encoding.update(audio_features) return encoding elif text is not None: return encoding diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index a79810d0c5bb57..fca47c524e5146 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -32,7 +32,7 @@ SemanticSegmenterOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -192,12 +192,6 @@ def forward( interpolate_pos_encoding: bool = False, ) -> torch.Tensor: _, _, height, width = pixel_values.shape - if not interpolate_pos_encoding and (height != self.image_size[0] or width != 
self.image_size[1]): - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model" - f" ({self.image_size[0]}*{self.image_size[1]})." - ) - embeddings, (patch_height, patch_width) = self.patch_embeddings( pixel_values, self.position_embeddings[:, 1:, :] if self.position_embeddings is not None else None ) @@ -281,6 +275,7 @@ def forward( class Data2VecVisionSelfAttention(nn.Module): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None: super().__init__() + self.config = config if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " @@ -314,6 +309,7 @@ def forward( output_attentions: bool = False, relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None, interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: mixed_query_layer = self.query(hidden_states) @@ -328,9 +324,11 @@ def forward( # Add relative position bias if present. if self.relative_position_bias is not None: + height, width = resolution + window_size = (height // self.config.patch_size, width // self.config.patch_size) attention_scores = attention_scores + self.relative_position_bias( - interpolate_pos_encoding, attention_scores.shape[2] - ).unsqueeze(0) + window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1] + ) # Add shared relative position bias if provided. if relative_position_bias is not None: @@ -410,9 +408,10 @@ def forward( output_attentions: bool = False, relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None, interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: self_outputs = self.attention( - hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding + hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding, resolution ) attention_output = self.output(self_outputs[0], hidden_states) @@ -483,6 +482,7 @@ def forward( output_attentions: bool = False, relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None, interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: self_attention_outputs = self.attention( self.layernorm_before(hidden_states), # in Data2VecVision, layernorm is applied before self-attention @@ -490,6 +490,7 @@ def forward( output_attentions=output_attentions, relative_position_bias=relative_position_bias, interpolate_pos_encoding=interpolate_pos_encoding, + resolution=resolution, ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights @@ -529,32 +530,71 @@ def __init__(self, config: Data2VecVisionConfig, window_size: tuple) -> None: ) # 2*Wh-1 * 2*Ww-1, nH # cls to token & token 2 cls & cls to cls + self.relative_position_indices = {} + + def generate_relative_position_index(self, window_size: Tuple[int, int]) -> torch.Tensor: + """ + This method creates the relative position index, modified to support arbitrary window sizes, + as introduced in [MiDaS v3.1](https://arxiv.org/abs/2307.14460). 
+ """ + num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + # cls to token & token 2 cls & cls to cls # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(window_size[0]) - coords_w = torch.arange(window_size[1]) - coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij")) # 2, Wh, Ww + window_area = window_size[0] * window_size[1] + grid = torch.meshgrid(torch.arange(window_size[0]), torch.arange(window_size[1]), indexing="ij") + coords = torch.stack(grid) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = torch.zeros( - size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype - ) + relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype) relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 + relative_position_index[0, 0:] = num_relative_distance - 3 + relative_position_index[0:, 0] = num_relative_distance - 2 + relative_position_index[0, 0] = num_relative_distance - 1 + return relative_position_index + + def forward(self, window_size, interpolate_pos_encoding: bool = False, dim_size=None) -> torch.Tensor: + """ + Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes. 
+ """ + old_height = 2 * self.window_size[0] - 1 + old_width = 2 * self.window_size[1] - 1 + + new_height = 2 * window_size[0] - 1 + new_width = 2 * window_size[1] - 1 - self.register_buffer("relative_position_index", relative_position_index, persistent=False) + old_relative_position_bias_table = self.relative_position_bias_table - def forward(self, interpolate_pos_encoding: bool = False, dim_size: Optional[int] = None) -> torch.Tensor: - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1 - ) # Wh*Ww,Wh*Ww,nH + old_num_relative_distance = self.num_relative_distance + new_num_relative_distance = new_height * new_width + 3 + + old_sub_table = old_relative_position_bias_table[: old_num_relative_distance - 3] + + old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2) + new_sub_table = nn.functional.interpolate( + old_sub_table, size=(int(new_height), int(new_width)), mode="bilinear" + ) + new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1) + + new_relative_position_bias_table = torch.cat( + [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]] + ) + + key = window_size + if key not in self.relative_position_indices.keys(): + self.relative_position_indices[key] = self.generate_relative_position_index(window_size) + + relative_position_bias = new_relative_position_bias_table[self.relative_position_indices[key].view(-1)] + # patch_size*num_patches_height, patch_size*num_patches_width, num_attention_heads + relative_position_bias = relative_position_bias.view( + window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1 + ) + # num_attention_heads, patch_size*num_patches_width, patch_size*num_patches_height + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww if interpolate_pos_encoding: relative_position_bias = nn.functional.interpolate( relative_position_bias.unsqueeze(1), @@ -563,7 +603,7 @@ def forward(self, interpolate_pos_encoding: bool = False, dim_size: Optional[int align_corners=False, ).squeeze(1) - return relative_position_bias + return relative_position_bias.unsqueeze(0) # Copied from transformers.models.beit.modeling_beit.BeitEncoder with Beit->Data2VecVision @@ -597,6 +637,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, return_dict: bool = True, ) -> Union[tuple, BaseModelOutput]: all_hidden_states = () if output_hidden_states else None @@ -616,13 +657,22 @@ def forward( output_attentions, ) else: + height, width = resolution + window_size = (height // self.config.patch_size, width // self.config.patch_size) relative_position_bias = ( - self.relative_position_bias(interpolate_pos_encoding, hidden_states.shape[1]) + self.relative_position_bias( + window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1] + ) if self.relative_position_bias is not None else None ) layer_outputs = layer_module( - hidden_states, layer_head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding + hidden_states, + layer_head_mask, + output_attentions, + relative_position_bias, + interpolate_pos_encoding, + resolution, ) hidden_states = 
layer_outputs[0] @@ -654,6 +704,7 @@ class Data2VecVisionPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["Data2VecVisionLayer"] + _keys_to_ignore_on_load_unexpected = [r".*relative_position_index.*"] def _init_weights(self, module): """Initialize the weights""" @@ -750,7 +801,7 @@ class PreTrainedModel ) def forward( self, - pixel_values: Optional[torch.Tensor] = None, + pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None, head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, @@ -768,9 +819,6 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -778,15 +826,17 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output, (patch_height, patch_width) = self.embeddings( + embedding_output, _ = self.embeddings( pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding ) + resolution = pixel_values.shape[2:] encoder_outputs = self.encoder( embedding_output, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + resolution=resolution, return_dict=return_dict, interpolate_pos_encoding=interpolate_pos_encoding, ) diff --git a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py index 9b9836e8522b3f..3e45c95de9abfa 100644 --- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py +++ b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py @@ -33,25 +33,28 @@ def get_dpt_config(model_name): if "small" in model_name: + out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=[9, 10, 11, 12], apply_layernorm=True, reshape_hidden_states=False + "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False ) fusion_hidden_size = 64 neck_hidden_sizes = [48, 96, 192, 384] elif "base" in model_name: + out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=[9, 10, 11, 12], apply_layernorm=True, reshape_hidden_states=False + "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False ) fusion_hidden_size = 128 neck_hidden_sizes = [96, 192, 384, 768] elif "large" in model_name: + out_indices = [5, 12, 18, 24] if "v2" in model_name else [21, 22, 23, 24] backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=[21, 22, 23, 24], apply_layernorm=True, reshape_hidden_states=False + "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False ) fusion_hidden_size = 256 neck_hidden_sizes = [256, 512, 1024, 1024] else: - raise NotImplementedError("To do") + raise NotImplementedError(f"Model not supported: {model_name}") config = DepthAnythingConfig( reassemble_hidden_size=backbone_config.hidden_size, @@ -169,9 +172,13 @@ def 
prepare_img(): name_to_checkpoint = { - "depth-anything-small": "depth_anything_vits14.pth", - "depth-anything-base": "depth_anything_vitb14.pth", - "depth-anything-large": "depth_anything_vitl14.pth", + "depth-anything-small": "pytorch_model.bin", + "depth-anything-base": "pytorch_model.bin", + "depth-anything-large": "pytorch_model.bin", + "depth-anything-v2-small": "depth_anything_v2_vits.pth", + "depth-anything-v2-base": "depth_anything_v2_vitb.pth", + "depth-anything-v2-large": "depth_anything_v2_vitl.pth", + # v2-giant pending } @@ -184,17 +191,23 @@ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, ve # define DPT configuration config = get_dpt_config(model_name) - model_name_to_filename = { - "depth-anything-small": "depth_anything_vits14.pth", - "depth-anything-base": "depth_anything_vitb14.pth", - "depth-anything-large": "depth_anything_vitl14.pth", + model_name_to_repo = { + "depth-anything-small": "LiheYoung/depth_anything_vits14", + "depth-anything-base": "LiheYoung/depth_anything_vitb14", + "depth-anything-large": "LiheYoung/depth_anything_vitl14", + "depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small", + "depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base", + "depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large", } # load original state_dict - filename = model_name_to_filename[model_name] + repo_id = model_name_to_repo[model_name] + filename = name_to_checkpoint[model_name] filepath = hf_hub_download( - repo_id="LiheYoung/Depth-Anything", filename=f"checkpoints/{filename}", repo_type="space" + repo_id=repo_id, + filename=f"{filename}", ) + state_dict = torch.load(filepath, map_location="cpu") # rename keys rename_keys = create_rename_keys(config) @@ -247,11 +260,23 @@ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, ve expected_slice = torch.tensor( [[87.9968, 87.7493, 88.2704], [87.1927, 87.6611, 87.3640], [86.7789, 86.9469, 86.7991]] ) + elif model_name == "depth-anything-v2-small": + expected_slice = torch.tensor( + [[2.6751, 2.6211, 2.6571], [2.5820, 2.6138, 2.6271], [2.6160, 2.6141, 2.6306]] + ) + elif model_name == "depth-anything-v2-base": + expected_slice = torch.tensor( + [[4.3576, 4.3723, 4.3908], [4.3231, 4.3146, 4.3611], [4.3016, 4.3170, 4.3121]] + ) + elif model_name == "depth-anything-v2-large": + expected_slice = torch.tensor( + [[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]] + ) else: raise ValueError("Not supported") assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-6) + assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) print("Looks ok!") if pytorch_dump_folder_path is not None: @@ -262,8 +287,8 @@ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, ve if push_to_hub: print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"LiheYoung/{model_name}-hf") - processor.push_to_hub(repo_id=f"LiheYoung/{model_name}-hf") + model.push_to_hub(repo_id=f"{model_name.title()}-hf") + processor.push_to_hub(repo_id=f"{model_name.title()}-hf") if __name__ == "__main__": diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 96f43a796e3886..a4e3da1528ec0b 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -58,7 +58,7 @@ def 
get_resize_output_image_size( multiple: int, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> Tuple[int, int]: - def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None): + def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): x = round(val / multiple) * multiple if max_val is not None and x > max_val: @@ -87,8 +87,8 @@ def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None): # fit height scale_width = scale_height - new_height = constraint_to_multiple_of(scale_height * input_height, multiple=multiple) - new_width = constraint_to_multiple_of(scale_width * input_width, multiple=multiple) + new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple) + new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple) return (new_height, new_width) diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index db5db0eae1189b..b2b88855669a76 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1021,7 +1021,7 @@ def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_wi class DPTDepthEstimationHead(nn.Module): """ - Output head head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples the predictions to the input resolution after the first convolutional layer (details can be found in the paper's supplementary material). """ diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index 6cf666d7ee2ae2..ffcdd2b61750a6 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -14,6 +14,8 @@ # limitations under the License. 
"""Fuyu model configuration""" +import warnings + from ...configuration_utils import PretrainedConfig from ...utils import logging from ..auto import CONFIG_MAPPING @@ -157,7 +159,7 @@ def __init__( text_model_type = text_config["model_type"] if "model_type" in text_config else "persimmon" self.text_config = CONFIG_MAPPING[text_model_type](**text_config) - self.vocab_size = vocab_size + self._vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.image_size = image_size self.patch_size = patch_size @@ -206,3 +208,20 @@ def _rope_scaling_validation(self): ) if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + + @property + def vocab_size(self): + warnings.warn( + "The `vocab_size` attribute is deprecated and will be removed in v4.44, Please use `text_config.vocab_size` instead.", + FutureWarning, + ) + return self._vocab_size + + @vocab_size.setter + def vocab_size(self, value): + self._vocab_size = value + + def to_dict(self): + output = super().to_dict() + output.pop("_vocab_size", None) + return output diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index e716e9f33488c9..0885bb9e1a4a2d 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -149,7 +149,7 @@ class FuyuForCausalLM(FuyuPreTrainedModel): def __init__(self, config: FuyuConfig): super().__init__(config) self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size + self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config( config.text_config, attn_implementation=config._attn_implementation ) @@ -168,6 +168,15 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.language_model.set_input_embeddings(value) + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: + # TODO: config.vocab_size is deprecated and will be removed in v4.43. 
+ # `resize_token_embeddings` should work from `modeling_utils.py`` + model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + return model_embeds + def gather_continuous_embeddings( self, word_embeddings: torch.Tensor, diff --git a/src/transformers/models/gemma2/__init__.py b/src/transformers/models/gemma2/__init__.py index 0d0aa148be5e33..ce59dfd8c7ac5a 100644 --- a/src/transformers/models/gemma2/__init__.py +++ b/src/transformers/models/gemma2/__init__.py @@ -39,7 +39,7 @@ ] if TYPE_CHECKING: - from .configuration_gemma import Gemma2Config + from .configuration_gemma2 import Gemma2Config try: if not is_torch_available(): @@ -47,7 +47,7 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_gemma import ( + from .modeling_gemma2 import ( Gemma2ForCausalLM, Gemma2ForSequenceClassification, Gemma2ForTokenClassification, diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index c8c6dbe86e01f7..2857e366d29e1c 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -221,7 +221,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" batch_size, q_len, _ = hidden_states.size() @@ -1344,9 +1344,6 @@ class Idefics2PreTrainedModel(PreTrainedModel): _supports_cache_class = True def _init_weights(self, module): - # important: this ported version of Idefics2 isn't meant for training from scratch - only - # inference and fine-tuning - so the proper init weights code has been removed - the original codebase - # https://github.com/haotian-liu/LLaVA/tree/main/idefics2 should serve for that purpose std = ( self.config.text_config.initializer_range if hasattr(self.config, "initializer_range") diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 04430ada87a04c..aa1bec59f5cadd 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -545,7 +545,6 @@ def forward( use_cache: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **kwargs, # `attention_mask` is passed by the tokenizer and we don't want it ) -> Union[Tuple, MambaOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -673,7 +672,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, use_cache: Optional[bool] = None, - **kwargs, # for now we need this for generation ) -> Union[Tuple, MambaCausalLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 475dda72c59295..9747cbe6fb98ae 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -1089,8 +1089,9 @@ def _update_causal_mask( exclude_mask = 
torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) if self.config.sliding_window is not None: if not using_sliding_window_cache or sequence_length > self.config.sliding_window: - exclude_mask |= torch.arange(target_length, device=device) <= ( - cache_position.reshape(-1, 1) - self.config.sliding_window + exclude_mask.bitwise_or_( + torch.arange(target_length, device=device) + <= (cache_position.reshape(-1, 1) - self.config.sliding_window) ) causal_mask *= exclude_mask causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) diff --git a/src/transformers/models/paligemma/configuration_paligemma.py b/src/transformers/models/paligemma/configuration_paligemma.py index d092142476c8c9..7ba3e008c42c78 100644 --- a/src/transformers/models/paligemma/configuration_paligemma.py +++ b/src/transformers/models/paligemma/configuration_paligemma.py @@ -13,6 +13,8 @@ # limitations under the License. """PaliGemmamodel configuration""" +import warnings + from ...configuration_utils import PretrainedConfig from ...utils import logging from ..auto import CONFIG_MAPPING @@ -86,7 +88,7 @@ def __init__( ): self.ignore_index = ignore_index self.image_token_index = image_token_index - self.vocab_size = vocab_size + self._vocab_size = vocab_size self.projection_dim = projection_dim self.hidden_size = hidden_size self.vision_config = vision_config @@ -124,7 +126,25 @@ def __init__( num_attention_heads=8, num_key_value_heads=1, is_encoder_decoder=False, + vocab_size=vocab_size, ) self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2 self.vision_config.projection_dim = projection_dim super().__init__(**kwargs) + + @property + def vocab_size(self): + warnings.warn( + "The `vocab_size` attribute is deprecated and will be removed in v4.44, Please use `text_config.vocab_size` instead.", + FutureWarning, + ) + return self._vocab_size + + @vocab_size.setter + def vocab_size(self, value): + self._vocab_size = value + + def to_dict(self): + output = super().to_dict() + output.pop("_vocab_size", None) + return output diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index 9f5bc0c5975d45..a640e7c7465abd 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -233,7 +233,7 @@ def __init__(self, config: PaliGemmaConfig): super().__init__(config) self.vision_tower = AutoModel.from_config(config=config.vision_config) self.multi_modal_projector = PaliGemmaMultiModalProjector(config) - self.vocab_size = config.vocab_size + self.vocab_size = config.text_config.vocab_size self._attn_implementation = config._attn_implementation language_model = AutoModelForCausalLM.from_config( @@ -276,8 +276,9 @@ def tie_weights(self): return self.language_model.tie_weights() def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: + # TODO: config.vocab_size is deprecated and will be removed in v4.43. 
+ # `resize_token_embeddings` should work from `modeling_utils.py` model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - # update vocab size self.config.text_config.vocab_size = model_embeds.num_embeddings self.config.vocab_size = model_embeds.num_embeddings self.vocab_size = model_embeds.num_embeddings diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index 2a8e1c25f6382c..40032851bfdc51 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -684,7 +684,6 @@ def forward( use_cache: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **kwargs, ) -> Union[Tuple, BaseModelOutputWithNoAttention]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -823,7 +822,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, use_cache: Optional[bool] = None, - **kwargs, # for now we need this for generation ) -> Union[Tuple, CausalLMOutput]: r""" Args: diff --git a/src/transformers/models/rt_detr/configuration_rt_detr.py b/src/transformers/models/rt_detr/configuration_rt_detr.py index d0f4bb17562b3a..0e34d0376f9fa6 100644 --- a/src/transformers/models/rt_detr/configuration_rt_detr.py +++ b/src/transformers/models/rt_detr/configuration_rt_detr.py @@ -37,6 +37,9 @@ class RTDetrConfig(PretrainedConfig): Args: initializer_range (`float`, *optional*, defaults to 0.01): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_bias_prior_prob (`float`, *optional*): + The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`. + If `None`, `prior_prob` is computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights. layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers.
batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -179,6 +182,7 @@ class RTDetrConfig(PretrainedConfig): def __init__( self, initializer_range=0.01, + initializer_bias_prior_prob=None, layer_norm_eps=1e-5, batch_norm_eps=1e-5, # backbone @@ -239,6 +243,7 @@ def __init__( **kwargs, ): self.initializer_range = initializer_range + self.initializer_bias_prior_prob = initializer_bias_prior_prob self.layer_norm_eps = layer_norm_eps self.batch_norm_eps = batch_norm_eps # backbone diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py index 26cf843357c641..850b8dc2f627b3 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -1148,14 +1148,27 @@ class RTDetrPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initalize the weights""" - """initialize conv/fc bias value according to a given probability value.""" - if isinstance(module, nn.Linear) and hasattr(module, "class_embed"): - prior_prob = self.config.initializer_range + """initialize linear layer bias value according to a given probability value.""" + if isinstance(module, (RTDetrForObjectDetection, RTDetrDecoder)): + if module.class_embed is not None: + for layer in module.class_embed: + prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) + bias = float(-math.log((1 - prior_prob) / prior_prob)) + nn.init.xavier_uniform_(layer.weight) + nn.init.constant_(layer.bias, bias) + + if module.bbox_embed is not None: + for layer in module.bbox_embed: + nn.init.constant_(layer.layers[-1].weight, 0) + nn.init.constant_(layer.layers[-1].bias, 0) + + if isinstance(module, RTDetrModel): + prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) bias = float(-math.log((1 - prior_prob) / prior_prob)) - nn.init.xavier_uniform_(module.weight) - if module.bias is not None: - nn.init.constant_(module.bias, bias) - elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + nn.init.xavier_uniform_(module.enc_score_head.weight) + nn.init.constant_(module.enc_score_head.bias, bias) + + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() @@ -1656,7 +1669,11 @@ def unfreeze_backbone(self): param.requires_grad_(True) @lru_cache(maxsize=32) - def generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype=torch.float32, device="cpu"): + def generate_anchors(self, spatial_shapes=None, grid_size=0.05): + # We always generate anchors in float32 to preserve equivalence between + # dynamic and static anchor inference + dtype = torch.float32 + if spatial_shapes is None: spatial_shapes = [ [int(self.config.anchor_image_size[0] / s), int(self.config.anchor_image_size[1] / s)] @@ -1674,7 +1691,7 @@ def generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype=torch.floa anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, height * width, 4)) # define the valid range for anchor coordinates eps = 1e-2 - anchors = torch.concat(anchors, 1).to(device) + anchors = torch.concat(anchors, 1) valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) anchors = torch.log(anchors / (1 - anchors)) anchors = torch.where(valid_mask, anchors, torch.inf) @@ -1769,15 +1786,15 @@ def forward( # Prepare encoder inputs (by flattening) source_flatten = [] - spatial_shapes = [] + 
spatial_shapes_list = [] for level, source in enumerate(sources): batch_size, num_channels, height, width = source.shape spatial_shape = (height, width) - spatial_shapes.append(spatial_shape) + spatial_shapes_list.append(spatial_shape) source = source.flatten(2).transpose(1, 2) source_flatten.append(source) source_flatten = torch.cat(source_flatten, 1) - spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) + spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) # prepare denoising training @@ -1805,9 +1822,14 @@ def forward( # prepare input for decoder if self.training or self.config.anchor_image_size is None: - anchors, valid_mask = self.generate_anchors(spatial_shapes, device=device, dtype=dtype) + # Pass spatial_shapes as tuple to make it hashable and make sure + # lru_cache is working for generate_anchors() + spatial_shapes_tuple = tuple(spatial_shapes_list) + anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple) else: - anchors, valid_mask = self.anchors.to(device, dtype), self.valid_mask.to(device, dtype) + anchors, valid_mask = self.anchors, self.valid_mask + + anchors, valid_mask = anchors.to(device, dtype), valid_mask.to(device, dtype) # use the valid_mask to selectively retain values in the feature map where the mask is `True` memory = valid_mask.to(source_flatten.dtype) * source_flatten diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index 4c534bbce6ce8a..50d41ef509a8c2 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -21,6 +21,7 @@ import numpy as np import torch +import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss @@ -34,12 +35,19 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings, ) from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + logger = logging.get_logger(__name__) # General docstring @@ -47,6 +55,19 @@ _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + def _trunc_normal_(tensor, mean, std, a, b): # Cut & paste from PyTorch official master until it's in a few official releases - RW # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf @@ -373,7 +394,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + ) -> Tuple[torch.Tensor, 
Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" batch_size, q_len, _ = hidden_states.size() @@ -421,6 +442,266 @@ def forward( return attn_output, attn_weights +class SiglipFlashAttention2(SiglipAttention): + """ + SiglipAttention flash attention module. This module inherits from `SiglipAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + is_causal = False + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + output_attentions = False + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
+ + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate + ) + + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +class SiglipSdpaAttention(SiglipAttention): + """ + Siglip attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `SiglipAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + is_causal = False + + # Adapted from SiglipAttention.forward and transformers.models.llama.modeling_llama.LlamaSdpaAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "SiglipModel is using SiglipSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. 
Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = True if self.is_causal and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None + + +SIGLIP_ATTENTION_CLASSES = { + "eager": SiglipAttention, + "flash_attention_2": SiglipFlashAttention2, + "sdpa": SiglipSdpaAttention, +} + + # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip class SiglipMLP(nn.Module): def __init__(self, config): @@ -437,12 +718,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip class SiglipEncoderLayer(nn.Module): def __init__(self, config: SiglipConfig): super().__init__() self.embed_dim = config.hidden_size - self.self_attn = SiglipAttention(config) + self.self_attn = SIGLIP_ATTENTION_CLASSES[config._attn_implementation](config=config) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP(config) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -503,6 +783,8 @@ class SiglipPreTrainedModel(PreTrainedModel): "SiglipEncoderLayer", "SiglipMultiheadAttentionPoolingHead", ] + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): """Initialize the weights""" @@ -754,6 +1036,7 @@ def __init__(self, config: SiglipTextConfig): self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.head = nn.Linear(embed_dim, embed_dim) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) 
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig) @@ -786,7 +1069,7 @@ def forward( # note: SigLIP's text model does not use a causal mask, unlike the original CLIP model. # expand attention_mask - if attention_mask is not None: + if attention_mask is not None and not self._use_flash_attention_2: # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) @@ -1041,8 +1324,13 @@ def __init__(self, config: SiglipConfig): text_config = config.text_config vision_config = config.vision_config - self.text_model = SiglipTextTransformer(text_config) - self.vision_model = SiglipVisionTransformer(vision_config) + # First, initialize the text and vision models with proper attention implementation + text_model = SiglipTextModel._from_config(text_config, attn_implementation=config._attn_implementation) + vision_model = SiglipVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation) + + # Second, get the text and vision submodules (for backward compatibility) + self.text_model = text_model.text_model + self.vision_model = vision_model.vision_model self.logit_scale = nn.Parameter(torch.randn(1)) self.logit_bias = nn.Parameter(torch.randn(1)) @@ -1234,7 +1522,12 @@ def forward( loss = None if return_loss: - raise NotImplementedError("SigLIP loss to be implemented") + # Adapted from https://github.com/google-research/big_vision/blob/01edb81a4716f93a48be43b3a4af14e29cdb3a7f/big_vision/trainers/proj/image_text/siglip.py#L287 + eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device) + m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye + loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text) + nll = -torch.sum(loglik, dim=-1) + loss = nll.mean() if not return_dict: output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) @@ -1265,7 +1558,13 @@ def __init__(self, config: SiglipConfig) -> None: super().__init__(config) self.num_labels = config.num_labels - self.vision_model = SiglipVisionTransformer(config.vision_config) + + # Create the vision model with proper attention + # and take only vision_model submodule (for backward compatibility) + vision_model = SiglipVisionModel._from_config( + config.vision_config, attn_implementation=config._attn_implementation + ) + self.vision_model = vision_model.vision_model # Classifier head self.classifier = ( diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index b1c0931dcf7040..b540576d48669c 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -129,9 +129,6 @@ class VideoLlavaPreTrainedModel(PreTrainedModel): _no_split_modules = ["VideoLlavaVisionAttention"] def _init_weights(self, module): - # important: this ported version of VideoLlava isn't meant for training from scratch - only - # inference and fine-tuning - so the proper init weights code has been removed - the original codebase - # https://github.com/haotian-liu/LLaVA/tree/main/video_llava should serve for that purpose std = ( self.config.initializer_range if hasattr(self.config, "initializer_range") diff --git a/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py 
index 0a6aff8c819922..5613f83a86b4e7 100644 --- a/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py @@ -94,8 +94,17 @@ def set_recursively(key, value, full_name, weight_type, hf_pointer): hf_param_name = PARAM_MAPPING[full_name.split(".")[-1]] weight_type = "param" + # fairseq uses nn.utils.weight_norm() while transformers switches to nn.utils.parametrizations.weight_norm() + # the mapping between two versions: + # https://github.com/pytorch/pytorch/blob/56935684c3dfad7841c83c719eeebecb560fe466/torch/nn/utils/parametrizations.py#L389-L395 + if weight_type is not None and weight_type != "param": - hf_shape = getattr(hf_pointer, weight_type).shape + if weight_type == "weight_g" and not hasattr(hf_pointer, "weight_g"): + hf_shape = hf_pointer.parametrizations.weight.original0.shape + elif weight_type == "weight_v" and not hasattr(hf_pointer, "weight_v"): + hf_shape = hf_pointer.parametrizations.weight.original1.shape + else: + hf_shape = getattr(hf_pointer, weight_type).shape elif weight_type is not None and weight_type == "param": shape_pointer = hf_pointer for attribute in hf_param_name.split("."): @@ -116,9 +125,15 @@ def set_recursively(key, value, full_name, weight_type, hf_pointer): if weight_type == "weight": hf_pointer.weight.data = value elif weight_type == "weight_g": - hf_pointer.weight_g.data = value + if hasattr(hf_pointer, "weight_g"): + hf_pointer.weight_g.data = value + else: + hf_pointer.parametrizations.weight.original0.data = value elif weight_type == "weight_v": - hf_pointer.weight_v.data = value + if hasattr(hf_pointer, "weight_v"): + hf_pointer.weight_v.data = value + else: + hf_pointer.parametrizations.weight.original1.data = value elif weight_type == "bias": hf_pointer.bias.data = value elif weight_type == "param": diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py index e7c0f47b58e587..d65811cbc8efe6 100644 --- a/src/transformers/models/whisper/configuration_whisper.py +++ b/src/transformers/models/whisper/configuration_whisper.py @@ -189,7 +189,11 @@ class WhisperConfig(PretrainedConfig): model_type = "whisper" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + attribute_map = { + "num_key_value_heads": "encoder_attention_heads", + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + } def __init__( self, diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index aedc0c43aca752..f1467a55e03b9b 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -25,7 +25,8 @@ from torch.nn import CrossEntropyLoss from ...activations import ACT2FN -from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -244,6 +245,7 @@ def __init__( is_decoder: bool = False, bias: bool = True, is_causal: bool = False, + layer_idx: Optional[int] = None, config: Optional[WhisperConfig] = None, ): super().__init__() @@ -262,6 
+264,14 @@ def __init__( self.is_decoder = is_decoder self.is_causal = is_causal + if layer_idx is None and is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + self.layer_idx = layer_idx + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False) self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) @@ -271,84 +281,56 @@ def __init__( def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - # Copied from transformers.models.bart.modeling_bart.BartAttention.forward with BART->whisper def forward( self, hidden_states: torch.Tensor, key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[EncoderDecoderCache] = None, attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, _ = hidden_states.size() # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): + query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz) + + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + past_key_value.is_updated[self.layer_idx] = True + past_key_value = past_key_value.cross_attention_cache + else: + past_key_value = past_key_value.self_attention_cache + + # use key_value_states if cross attention + current_states = key_value_states if key_value_states is not None else hidden_states + if is_cross_attention and past_key_value and is_updated: # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) + key_states = past_key_value.key_cache[self.layer_idx] + value_states = past_key_value.value_cache[self.layer_idx] else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states),
-1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.reshape(*proj_shape) - value_states = value_states.reshape(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + key_states = self._shape(self.k_proj(current_states), -1, bsz) + value_states = self._shape(self.v_proj(current_states), -1, bsz) + if past_key_value is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1) @@ -358,42 +340,27 @@ def forward( f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" f" {layer_head_mask.size()}" ) - attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_probs, value_states) - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" f" {attn_output.size()}" ) - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be # partitioned across GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) - return attn_output, attn_weights_reshaped, past_key_value + return attn_output, attn_weights, past_key_value -# Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->Whisper class WhisperFlashAttention2(WhisperAttention): """ Whisper flash attention module. This module inherits from `WhisperAttention` as the weights of the module stays @@ -410,18 +377,21 @@ def __init__(self, *args, **kwargs): # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim) - def forward( self, hidden_states: torch.Tensor, key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[EncoderDecoderCache] = None, attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if isinstance(past_key_value, StaticCache): + raise ValueError( + "The `static` cache implementation is not compatible with `attn_implementation='flash_attention_2'`. 
" + "Use `attn_implementation='sdpa'` in the meantime, and open an issue at https://github.com/huggingface/transformers" + ) # WhisperFlashAttention2 attention does not support output_attentions if output_attentions: raise ValueError("WhisperFlashAttention2 attention does not support output_attentions") @@ -429,51 +399,45 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - - bsz, q_len, _ = hidden_states.size() + bsz, tgt_len, _ = hidden_states.size() # get query proj - query_states = self._reshape(self.q_proj(hidden_states), -1, bsz) - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): + query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz) + + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + past_key_value.is_updated[self.layer_idx] = True + past_key_value = past_key_value.cross_attention_cache + else: + past_key_value = past_key_value.self_attention_cache + + # use key_value_states if cross attention + current_states = key_value_states if key_value_states is not None else hidden_states + if is_cross_attention and past_key_value and is_updated: # reuse k,v, cross_attentions - key_states = past_key_value[0].transpose(1, 2) - value_states = past_key_value[1].transpose(1, 2) - elif is_cross_attention: - # cross_attentions - key_states = self._reshape(self.k_proj(key_value_states), -1, bsz) - value_states = self._reshape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) - value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1) - value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1) + key_states = past_key_value.key_cache[self.layer_idx] + value_states = past_key_value.value_cache[self.layer_idx] else: - # self_attention - key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) - value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2)) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] + key_states = self._shape(self.k_proj(current_states), -1, bsz) + value_states = self._shape(self.v_proj(current_states), -1, bsz) + if past_key_value is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim] + # We would need to refactor the KV cache to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + causal_mask = attention_mask + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. Hence, we need @@ -502,10 +466,10 @@ def forward( value_states = value_states.to(target_dtype) attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout + query_states, key_states, value_states, causal_mask, tgt_len, dropout=self.dropout ) - attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = attn_output.reshape(bsz, tgt_len, -1) attn_output = self.out_proj(attn_output) if not output_attentions: @@ -614,15 +578,15 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query class WhisperSdpaAttention(WhisperAttention): - # Copied from transformers.models.bart.modeling_bart.BartSdpaAttention.forward with BART->whisper, Bart->Whisper def forward( self, hidden_states: torch.Tensor, key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[EncoderDecoderCache] = None, attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" if output_attentions or layer_head_mask is not None: @@ -638,59 +602,50 @@ def forward( attention_mask=attention_mask, layer_head_mask=layer_head_mask, output_attentions=output_attentions, + cache_position=cache_position, ) # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, _ = hidden_states.size() # get query proj - query_states = self.q_proj(hidden_states) - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning 
- if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): + query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz) + + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + past_key_value.is_updated[self.layer_idx] = True + past_key_value = past_key_value.cross_attention_cache + else: + past_key_value = past_key_value.self_attention_cache + + # use key_value_states if cross attention + current_states = key_value_states if key_value_states is not None else hidden_states + if is_cross_attention and past_key_value and is_updated: # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) + key_states = past_key_value.key_cache[self.layer_idx] + value_states = past_key_value.value_cache[self.layer_idx] else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - query_states = self._shape(query_states, tgt_len, bsz) + key_states = self._shape(self.k_proj(current_states), -1, bsz) + value_states = self._shape(self.v_proj(current_states), -1, bsz) + if past_key_value is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + + causal_mask = attention_mask + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. 
- is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + is_causal = True if self.is_causal and causal_mask is None and tgt_len > 1 else False # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 @@ -698,7 +653,7 @@ def forward( query_states, key_states, value_states, - attn_mask=attention_mask, + attn_mask=causal_mask, dropout_p=self.dropout if self.training else 0.0, is_causal=is_causal, ) @@ -798,9 +753,8 @@ def forward( return outputs -# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->Whisper, MBART->WHISPER class WhisperDecoderLayer(nn.Module): - def __init__(self, config: WhisperConfig): + def __init__(self, config: WhisperConfig, layer_idx: int = None): super().__init__() self.embed_dim = config.d_model @@ -810,6 +764,7 @@ def __init__(self, config: WhisperConfig): dropout=config.attention_dropout, is_decoder=True, is_causal=True, + layer_idx=layer_idx, config=config, ) self.dropout = config.dropout @@ -822,6 +777,7 @@ def __init__(self, config: WhisperConfig): config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, + layer_idx=layer_idx, config=config, ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -837,9 +793,10 @@ def forward( encoder_attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, cross_attn_layer_head_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[EncoderDecoderCache] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = True, + cache_position: Optional[torch.LongTensor] = None, ) -> torch.Tensor: """ Args: @@ -863,41 +820,35 @@ def forward( hidden_states = self.self_attn_layer_norm(hidden_states) # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, + past_key_value=past_key_value, attention_mask=attention_mask, layer_head_mask=layer_head_mask, output_attentions=output_attentions, + cache_position=cache_position, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # Cross-Attention Block - cross_attn_present_key_value = None cross_attn_weights = None if encoder_hidden_states is not None: residual = hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( hidden_states=hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, + past_key_value=past_key_value, output_attentions=output_attentions, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) 
hidden_states = residual + hidden_states - # add cross-attn to positions 3,4 of present_key_value tuple - present_key_value = present_key_value + cross_attn_present_key_value + # add cross-attn to positions 1 of present_key_value tuple + present_key_value = (present_key_value, cross_attn_present_key_value) # Fully Connected residual = hidden_states @@ -927,6 +878,8 @@ class WhisperPreTrainedModel(PreTrainedModel): _no_split_modules = ["WhisperEncoderLayer", "WhisperDecoderLayer"] _supports_flash_attn_2 = True _supports_sdpa = True + _supports_cache_class = True + _supports_static_cache = True def _init_weights(self, module): std = self.config.init_std @@ -1024,14 +977,18 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + past_key_values (`EncoderDecoderCache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states that can be used to speed up auto-regressive (sequential) decoding. There are + four sets of pre-computed hidden-states: key and values states in the self-attention blocks (2) and + in the cross-attention blocks (2). The `past_key_values` are returned when `use_cache=True` is passed or + when `config.use_cache=True` + + Two formats are allowed: + - An [`~cache_utils.EncoderDecoderCache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. @@ -1051,6 +1008,9 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the cache + in the correct position and to infer the complete sequence length. 
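The docstring above accepts either an `EncoderDecoderCache` instance or the legacy per-layer tuples. A short sketch of converting between the two formats, using the `from_legacy_cache`/`to_legacy_cache` helpers referenced elsewhere in this diff (tensor shapes are illustrative only):

```python
import torch
from transformers.cache_utils import EncoderDecoderCache

num_layers, bsz, heads, head_dim = 2, 1, 8, 64

# Legacy format: one tuple per layer with 2 self-attention and 2 cross-attention tensors.
legacy = tuple(
    (
        torch.randn(bsz, heads, 5, head_dim),     # self-attention keys
        torch.randn(bsz, heads, 5, head_dim),     # self-attention values
        torch.randn(bsz, heads, 1500, head_dim),  # cross-attention keys
        torch.randn(bsz, heads, 1500, head_dim),  # cross-attention values
    )
    for _ in range(num_layers)
)

cache = EncoderDecoderCache.from_legacy_cache(legacy)  # new cache object
roundtrip = cache.to_legacy_cache()                    # back to per-layer tuples

print(cache.get_seq_length())             # 5 (self-attention length)
print(len(roundtrip), len(roundtrip[0]))  # 2 4
```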
""" WHISPER_ENCODER_INPUTS_DOCSTRING = r""" @@ -1256,7 +1216,9 @@ def __init__(self, config: WhisperConfig): self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) self.embed_positions = WhisperPositionalEmbedding(self.max_target_positions, config.d_model) - self.layers = nn.ModuleList([WhisperDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layers = nn.ModuleList( + [WhisperDecoderLayer(config, layer_idx) for layer_idx in range(config.decoder_layers)] + ) self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" self._use_sdpa = config._attn_implementation == "sdpa" @@ -1286,6 +1248,7 @@ def forward( output_attentions=None, output_hidden_states=None, return_dict=None, + cache_position=None, ): r""" Args: @@ -1320,13 +1283,17 @@ def forward( - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of - shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + past_key_values (`EncoderDecoderCache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states that can be used to speed up auto-regressive (sequential) decoding. There are + four sets of pre-computed hidden-states: key and values states in the self-attention blocks (2) and + in the cross-attention blocks (2). The `past_key_values` are returned when `use_cache=True` is passed or + when `config.use_cache=True` - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + Two formats are allowed: + - An [`~cache_utils.EncoderDecoderCache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of @@ -1344,6 +1311,9 @@ def forward( for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. 
""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1363,26 +1333,38 @@ def forward( else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - if self._use_flash_attention_2: - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._use_sdpa and head_mask is None and not output_attentions: - # output_attentions=True & head_mask can not be supported when using SDPA. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length + return_legacy_cache = False + return_self_attention_cache = False + if use_cache or past_key_values is not None: + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." 
+ ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + past_key_values_length = 0 + if cache_position is not None: + past_key_values_length = cache_position[0] + elif past_key_values is not None: + past_key_values_length = past_key_values.get_seq_length() + + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + input_shape[1], device=inputs_embeds.device ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + # embed positions if input_ids is not None: positions = self.embed_positions( @@ -1396,6 +1378,14 @@ def forward( hidden_states = inputs_embeds + positions.to(inputs_embeds.device) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, + ) + if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( @@ -1406,7 +1396,6 @@ def forward( all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - next_decoder_cache = () if use_cache else None # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): @@ -1424,13 +1413,11 @@ def forward( if dropout_probability < self.layerdrop: continue - past_key_value = past_key_values[idx] if past_key_values is not None else None - if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( decoder_layer.__call__, hidden_states, - attention_mask, + causal_mask, encoder_hidden_states, None, # encoder attention mask head_mask[idx] if head_mask is not None else None, @@ -1438,25 +1425,24 @@ def forward( None, # past_key_value output_attentions, use_cache, + cache_position, ) else: layer_outputs = decoder_layer( hidden_states, - attention_mask=attention_mask, + attention_mask=causal_mask, encoder_hidden_states=encoder_hidden_states, layer_head_mask=(head_mask[idx] if head_mask is not None else None), cross_attn_layer_head_mask=( cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None ), - past_key_value=past_key_value, + past_key_value=past_key_values if use_cache else None, output_attentions=output_attentions, use_cache=use_cache, + cache_position=cache_position, ) hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) - if output_attentions: all_self_attns += (layer_outputs[1],) @@ -1468,7 +1454,11 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = next_decoder_cache if use_cache else None + next_cache = past_key_values if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = past_key_values.to_legacy_cache() if not return_dict: return tuple( v @@ -1483,6 +1473,87 @@ def forward( cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, 
+ output_attentions: bool, + ): + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + @add_start_docstrings( "The bare Whisper Model outputting raw hidden-states without any specific head on top.", @@ -1571,13 +1642,14 @@ def forward( decoder_head_mask: Optional[torch.Tensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None, decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None, decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]: r""" Returns: @@ -1637,6 +1709,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) if not return_dict: @@ -1704,7 +1777,7 @@ def forward( decoder_head_mask: Optional[torch.Tensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None, decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None, decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None, labels: Optional[torch.LongTensor] = None, @@ -1712,6 +1785,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1766,6 +1840,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) lm_logits = self.proj_out(outputs[0]) @@ -1800,14 +1875,19 @@ def prepare_inputs_for_generation( encoder_outputs=None, attention_mask=None, decoder_attention_mask=None, + cache_position=None, **kwargs, ): decoder_position_ids = None if decoder_attention_mask is not None: decoder_position_ids = (decoder_attention_mask.cumsum(-1) - 1).clamp(min=0) + past_length = 0 if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] + if isinstance(past_key_values, EncoderDecoderCache): + past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() + else: + past_length = past_key_values[0][0].shape[2] # Some generation methods already pass only the last input ID if decoder_input_ids.shape[1] > past_length: @@ -1821,6 +1901,13 @@ def prepare_inputs_for_generation( if decoder_position_ids is not None and decoder_position_ids.shape[1] > decoder_input_ids.shape[1]: decoder_position_ids = decoder_position_ids[:, remove_prefix_length:] + if cache_position is None: + cache_position = torch.arange( + past_length, past_length + decoder_input_ids.shape[1], device=decoder_input_ids.device + ) + elif use_cache: + cache_position = cache_position[-decoder_input_ids.shape[1] :] + return { 
"encoder_outputs": encoder_outputs, "past_key_values": past_key_values, @@ -1828,6 +1915,7 @@ def prepare_inputs_for_generation( "use_cache": use_cache, "decoder_attention_mask": decoder_attention_mask, "decoder_position_ids": decoder_position_ids, + "cache_position": cache_position, } @staticmethod @@ -1914,6 +2002,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: r""" Args: @@ -1968,6 +2057,9 @@ def forward( for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the cache + in the correct position and to infer the complete sequence length. Returns: @@ -2019,6 +2111,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) logits = self.proj_out(outputs[0]) @@ -2049,10 +2142,15 @@ def prepare_inputs_for_generation( use_cache=None, encoder_outputs=None, attention_mask=None, + cache_position=None, **kwargs, ): + past_length = 0 if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] + if isinstance(past_key_values, (Cache, EncoderDecoderCache)): + past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() + else: + past_length = past_key_values[0][0].shape[2] # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: @@ -2063,12 +2161,18 @@ def prepare_inputs_for_generation( input_ids = input_ids[:, remove_prefix_length:] + if cache_position is None: + cache_position = torch.arange(past_length, past_length + input_ids.shape[1], device=input_ids.device) + elif use_cache: + cache_position = cache_position[-input_ids.shape[1] :] + return { "encoder_outputs": encoder_outputs, "past_key_values": past_key_values, "input_ids": input_ids, "use_cache": use_cache, "attention_mask": attention_mask, + "cache_position": cache_position, } @staticmethod diff --git a/src/transformers/models/zoedepth/__init__.py b/src/transformers/models/zoedepth/__init__.py new file mode 100644 index 00000000000000..15ba0883d83241 --- /dev/null +++ b/src/transformers/models/zoedepth/__init__.py @@ -0,0 +1,67 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...file_utils import _LazyModule, is_torch_available, is_vision_available +from ...utils import OptionalDependencyNotAvailable + + +_import_structure = {"configuration_zoedepth": ["ZOEDEPTH_PRETRAINED_CONFIG_ARCHIVE_MAP", "ZoeDepthConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_zoedepth"] = [ + "ZoeDepthForDepthEstimation", + "ZoeDepthPreTrainedModel", + ] + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_zoedepth"] = ["ZoeDepthImageProcessor"] + + +if TYPE_CHECKING: + from .configuration_zoedepth import ZOEDEPTH_PRETRAINED_CONFIG_ARCHIVE_MAP, ZoeDepthConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_zoedepth import ( + ZoeDepthForDepthEstimation, + ZoeDepthPreTrainedModel, + ) + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_zoedepth import ZoeDepthImageProcessor + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/zoedepth/configuration_zoedepth.py b/src/transformers/models/zoedepth/configuration_zoedepth.py new file mode 100644 index 00000000000000..1b7e2695eb98c9 --- /dev/null +++ b/src/transformers/models/zoedepth/configuration_zoedepth.py @@ -0,0 +1,234 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ZoeDepth model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto.configuration_auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + +ZOEDEPTH_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Intel/zoedepth-nyu": "https://huggingface.co/Intel/zoedepth-nyu/resolve/main/config.json", +} + + +class ZoeDepthConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ZoeDepthForDepthEstimation`]. It is used to instantiate an ZoeDepth + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the ZoeDepth + [Intel/zoedepth-nyu](https://huggingface.co/Intel/zoedepth-nyu) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*, defaults to `BeitConfig()`): + The configuration of the backbone model. 
+ backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + batch_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the batch normalization layers. + readout_type (`str`, *optional*, defaults to `"project"`): + The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of + the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`]. + + - "ignore" simply ignores the CLS token. + - "add" passes the information from the CLS token to all other tokens by adding the representations. + - "project" passes information to the other tokens by concatenating the readout to all other tokens before + projecting the + representation to the original feature dimension D using a linear layer followed by a GELU non-linearity. + reassemble_factors (`List[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`): + The up/downsampling factors of the reassemble layers. + neck_hidden_sizes (`List[str]`, *optional*, defaults to `[96, 192, 384, 768]`): + The hidden sizes to project to for the feature maps of the backbone. + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. + head_in_index (`int`, *optional*, defaults to -1): + The index of the features to use in the heads. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): + Whether to use bias in the pre-activate residual units of the fusion blocks. + num_relative_features (`int`, *optional*, defaults to 32): + The number of features to use in the relative depth estimation head. + add_projection (`bool`, *optional*, defaults to `False`): + Whether to add a projection layer before the depth estimation head. + bottleneck_features (`int`, *optional*, defaults to 256): + The number of features in the bottleneck layer. + num_attractors (`List[int], *optional*, defaults to `[16, 8, 4, 1]`): + The number of attractors to use in each stage. + bin_embedding_dim (`int`, *optional*, defaults to 128): + The dimension of the bin embeddings. + attractor_alpha (`int`, *optional*, defaults to 1000): + The alpha value to use in the attractor. + attractor_gamma (`int`, *optional*, defaults to 2): + The gamma value to use in the attractor. + attractor_kind (`str`, *optional*, defaults to `"mean"`): + The kind of attractor to use. Can be one of [`"mean"`, `"sum"`]. 
+ min_temp (`float`, *optional*, defaults to 0.0212):
+ The minimum temperature value to consider.
+ max_temp (`float`, *optional*, defaults to 50.0):
+ The maximum temperature value to consider.
+ bin_centers_type (`str`, *optional*, defaults to `"softplus"`):
+ Activation type used for the bin centers. Can be "normed" or "softplus". For "normed" bin centers, a linear normalization trick
+ is applied, which results in bounded bin centers. For "softplus", the softplus activation is used, so the bin centers are unbounded.
+ bin_configurations (`List[dict]`, *optional*, defaults to `[{'n_bins': 64, 'min_depth': 0.001, 'max_depth': 10.0}]`):
+ Configuration for each of the bin heads.
+ Each configuration should consist of the following keys:
+ - `name` (`str`): The name of the bin head - only required in case of multiple bin configurations.
+ - `n_bins` (`int`): The number of bins to use.
+ - `min_depth` (`float`): The minimum depth value to consider.
+ - `max_depth` (`float`): The maximum depth value to consider.
+ In case only a single configuration is passed, the model will use a single head with the specified configuration.
+ In case multiple configurations are passed, the model will use multiple heads with the specified configurations.
+ num_patch_transformer_layers (`int`, *optional*):
+ The number of transformer layers to use in the patch transformer. Only used in case of multiple bin configurations.
+ patch_transformer_hidden_size (`int`, *optional*):
+ The hidden size to use in the patch transformer. Only used in case of multiple bin configurations.
+ patch_transformer_intermediate_size (`int`, *optional*):
+ The intermediate size to use in the patch transformer. Only used in case of multiple bin configurations.
+ patch_transformer_num_attention_heads (`int`, *optional*):
+ The number of attention heads to use in the patch transformer. Only used in case of multiple bin configurations.
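As a concrete illustration of the multi-configuration case described above, here is a sketch of a two-headed (NYU + KITTI) configuration; the values mirror what the conversion script further below uses for the `ZoeD_NK` checkpoint.

```python
from transformers import ZoeDepthConfig

# Two bin heads; the patch transformer settings are only used in this multi-configuration case.
nk_config = ZoeDepthConfig(
    bin_centers_type="softplus",
    bin_configurations=[
        {"name": "nyu", "n_bins": 64, "min_depth": 1e-3, "max_depth": 10.0},
        {"name": "kitti", "n_bins": 64, "min_depth": 1e-3, "max_depth": 80.0},
    ],
    num_patch_transformer_layers=4,
    patch_transformer_hidden_size=128,
    patch_transformer_intermediate_size=1024,
    patch_transformer_num_attention_heads=4,
)
print(len(nk_config.bin_configurations))  # 2
```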
+ + Example: + + ```python + >>> from transformers import ZoeDepthConfig, ZoeDepthForDepthEstimation + + >>> # Initializing a ZoeDepth zoedepth-large style configuration + >>> configuration = ZoeDepthConfig() + + >>> # Initializing a model from the zoedepth-large style configuration + >>> model = ZoeDepthForDepthEstimation(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "zoedepth" + + def __init__( + self, + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + backbone_kwargs=None, + hidden_act="gelu", + initializer_range=0.02, + batch_norm_eps=1e-05, + readout_type="project", + reassemble_factors=[4, 2, 1, 0.5], + neck_hidden_sizes=[96, 192, 384, 768], + fusion_hidden_size=256, + head_in_index=-1, + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=None, + num_relative_features=32, + add_projection=False, + bottleneck_features=256, + num_attractors=[16, 8, 4, 1], + bin_embedding_dim=128, + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind="mean", + min_temp=0.0212, + max_temp=50.0, + bin_centers_type="softplus", + bin_configurations=[{"n_bins": 64, "min_depth": 0.001, "max_depth": 10.0}], + num_patch_transformer_layers=None, + patch_transformer_hidden_size=None, + patch_transformer_intermediate_size=None, + patch_transformer_num_attention_heads=None, + **kwargs, + ): + super().__init__(**kwargs) + + if readout_type not in ["ignore", "add", "project"]: + raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']") + + if attractor_kind not in ["mean", "sum"]: + raise ValueError("Attractor_kind must be one of ['mean', 'sum']") + + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `BEiT` backbone.") + backbone_config = CONFIG_MAPPING["beit"]( + image_size=384, + num_hidden_layers=24, + hidden_size=1024, + intermediate_size=4096, + num_attention_heads=16, + use_relative_position_bias=True, + reshape_hidden_states=False, + out_features=["stage6", "stage12", "stage18", "stage24"], + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: + raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + + self.backbone_config = backbone_config + self.backbone = backbone + self.hidden_act = hidden_act + self.use_pretrained_backbone = use_pretrained_backbone + self.initializer_range = initializer_range + self.batch_norm_eps = batch_norm_eps + self.readout_type = readout_type + self.reassemble_factors = reassemble_factors + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + self.head_in_index = head_in_index + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + self.use_bias_in_fusion_residual = use_bias_in_fusion_residual + self.num_relative_features = num_relative_features + self.add_projection = add_projection + + self.bottleneck_features = bottleneck_features + self.num_attractors = num_attractors + self.bin_embedding_dim = bin_embedding_dim + self.attractor_alpha = attractor_alpha + self.attractor_gamma = attractor_gamma + self.attractor_kind = attractor_kind + self.min_temp = min_temp + self.max_temp = max_temp + self.bin_centers_type = bin_centers_type + self.bin_configurations = bin_configurations + self.num_patch_transformer_layers = num_patch_transformer_layers + self.patch_transformer_hidden_size = patch_transformer_hidden_size + self.patch_transformer_intermediate_size = patch_transformer_intermediate_size + self.patch_transformer_num_attention_heads = patch_transformer_num_attention_heads diff --git a/src/transformers/models/zoedepth/convert_zoedepth_to_hf.py b/src/transformers/models/zoedepth/convert_zoedepth_to_hf.py new file mode 100644 index 00000000000000..9a6701c35bcdf9 --- /dev/null +++ b/src/transformers/models/zoedepth/convert_zoedepth_to_hf.py @@ -0,0 +1,426 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert ZoeDepth checkpoints from the original repository. URL: https://github.com/isl-org/ZoeDepth. 
+ +Original logits where obtained by running the following code: +!git clone -b understanding_zoedepth https://github.com/NielsRogge/ZoeDepth +!python inference.py +""" + +import argparse +from pathlib import Path + +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import BeitConfig, ZoeDepthConfig, ZoeDepthForDepthEstimation, ZoeDepthImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_zoedepth_config(model_name): + image_size = 384 + backbone_config = BeitConfig( + image_size=image_size, + num_hidden_layers=24, + hidden_size=1024, + intermediate_size=4096, + num_attention_heads=16, + use_relative_position_bias=True, + reshape_hidden_states=False, + out_features=["stage6", "stage12", "stage18", "stage24"], # beit-large-512 uses [5, 11, 17, 23], + ) + + neck_hidden_sizes = [256, 512, 1024, 1024] + bin_centers_type = "softplus" if model_name in ["ZoeD_N", "ZoeD_NK"] else "normed" + if model_name == "ZoeD_NK": + bin_configurations = [ + {"name": "nyu", "n_bins": 64, "min_depth": 1e-3, "max_depth": 10.0}, + {"name": "kitti", "n_bins": 64, "min_depth": 1e-3, "max_depth": 80.0}, + ] + elif model_name in ["ZoeD_N", "ZoeD_K"]: + bin_configurations = [ + {"name": "nyu", "n_bins": 64, "min_depth": 1e-3, "max_depth": 10.0}, + ] + config = ZoeDepthConfig( + backbone_config=backbone_config, + neck_hidden_sizes=neck_hidden_sizes, + bin_centers_type=bin_centers_type, + bin_configurations=bin_configurations, + num_patch_transformer_layers=4 if model_name == "ZoeD_NK" else None, + patch_transformer_hidden_size=128 if model_name == "ZoeD_NK" else None, + patch_transformer_intermediate_size=1024 if model_name == "ZoeD_NK" else None, + patch_transformer_num_attention_heads=4 if model_name == "ZoeD_NK" else None, + ) + + return config, image_size + + +def rename_key(name): + # Transformer backbone + if "core.core.pretrained.model.blocks" in name: + name = name.replace("core.core.pretrained.model.blocks", "backbone.encoder.layer") + if "core.core.pretrained.model.patch_embed.proj" in name: + name = name.replace( + "core.core.pretrained.model.patch_embed.proj", "backbone.embeddings.patch_embeddings.projection" + ) + if "core.core.pretrained.model.cls_token" in name: + name = name.replace("core.core.pretrained.model.cls_token", "backbone.embeddings.cls_token") + if "norm1" in name and "patch_transformer" not in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name and "patch_transformer" not in name: + name = name.replace("norm2", "layernorm_after") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + if "gamma_1" in name: + name = name.replace("gamma_1", "lambda_1") + if "gamma_2" in name: + name = name.replace("gamma_2", "lambda_2") + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "attn.relative_position_bias_table" in name: + name = name.replace( + "attn.relative_position_bias_table", + "attention.attention.relative_position_bias.relative_position_bias_table", + ) + if "attn.relative_position_index" in name: + name = name.replace( + "attn.relative_position_index", "attention.attention.relative_position_bias.relative_position_index" + ) + + # activation postprocessing (readout projections + resize blocks) + if "core.core.pretrained.act_postprocess1.0.project" in name: + name = name.replace( + 
"core.core.pretrained.act_postprocess1.0.project", "neck.reassemble_stage.readout_projects.0" + ) + if "core.core.pretrained.act_postprocess2.0.project" in name: + name = name.replace( + "core.core.pretrained.act_postprocess2.0.project", "neck.reassemble_stage.readout_projects.1" + ) + if "core.core.pretrained.act_postprocess3.0.project" in name: + name = name.replace( + "core.core.pretrained.act_postprocess3.0.project", "neck.reassemble_stage.readout_projects.2" + ) + if "core.core.pretrained.act_postprocess4.0.project" in name: + name = name.replace( + "core.core.pretrained.act_postprocess4.0.project", "neck.reassemble_stage.readout_projects.3" + ) + + if "core.core.pretrained.act_postprocess1.3" in name: + name = name.replace("core.core.pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") + if "core.core.pretrained.act_postprocess2.3" in name: + name = name.replace("core.core.pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") + if "core.core.pretrained.act_postprocess3.3" in name: + name = name.replace("core.core.pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") + if "core.core.pretrained.act_postprocess4.3" in name: + name = name.replace("core.core.pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") + + if "core.core.pretrained.act_postprocess1.4" in name: + name = name.replace("core.core.pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") + if "core.core.pretrained.act_postprocess2.4" in name: + name = name.replace("core.core.pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") + if "core.core.pretrained.act_postprocess4.4" in name: + name = name.replace("core.core.pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") + + # scratch convolutions + if "core.core.scratch.layer1_rn.weight" in name: + name = name.replace("core.core.scratch.layer1_rn.weight", "neck.convs.0.weight") + if "core.core.scratch.layer2_rn.weight" in name: + name = name.replace("core.core.scratch.layer2_rn.weight", "neck.convs.1.weight") + if "core.core.scratch.layer3_rn.weight" in name: + name = name.replace("core.core.scratch.layer3_rn.weight", "neck.convs.2.weight") + if "core.core.scratch.layer4_rn.weight" in name: + name = name.replace("core.core.scratch.layer4_rn.weight", "neck.convs.3.weight") + + # fusion layers + # tricky here: mapping = {1:3, 2:2, 3:1, 4:0} + if "core.core.scratch.refinenet1" in name: + name = name.replace("core.core.scratch.refinenet1", "neck.fusion_stage.layers.3") + if "core.core.scratch.refinenet2" in name: + name = name.replace("core.core.scratch.refinenet2", "neck.fusion_stage.layers.2") + if "core.core.scratch.refinenet3" in name: + name = name.replace("core.core.scratch.refinenet3", "neck.fusion_stage.layers.1") + if "core.core.scratch.refinenet4" in name: + name = name.replace("core.core.scratch.refinenet4", "neck.fusion_stage.layers.0") + + if "resConfUnit1" in name: + name = name.replace("resConfUnit1", "residual_layer1") + + if "resConfUnit2" in name: + name = name.replace("resConfUnit2", "residual_layer2") + + if "conv1" in name: + name = name.replace("conv1", "convolution1") + + if "conv2" in name and "residual_layer" in name: + name = name.replace("conv2", "convolution2") + + if "out_conv" in name: + name = name.replace("out_conv", "projection") + + # relative depth estimation head + if "core.core.scratch.output_conv.0" in name: + name = name.replace("core.core.scratch.output_conv.0", "relative_head.conv1") + + if 
"core.core.scratch.output_conv.2" in name: + name = name.replace("core.core.scratch.output_conv.2", "relative_head.conv2") + + if "core.core.scratch.output_conv.4" in name: + name = name.replace("core.core.scratch.output_conv.4", "relative_head.conv3") + + # patch transformer + if "patch_transformer" in name: + name = name.replace("patch_transformer", "metric_head.patch_transformer") + + if "mlp_classifier.0" in name: + name = name.replace("mlp_classifier.0", "metric_head.mlp_classifier.linear1") + if "mlp_classifier.2" in name: + name = name.replace("mlp_classifier.2", "metric_head.mlp_classifier.linear2") + + if "projectors" in name: + name = name.replace("projectors", "metric_head.projectors") + + if "seed_bin_regressors" in name: + name = name.replace("seed_bin_regressors", "metric_head.seed_bin_regressors") + + if "seed_bin_regressor" in name and "seed_bin_regressors" not in name: + name = name.replace("seed_bin_regressor", "metric_head.seed_bin_regressor") + + if "seed_projector" in name: + name = name.replace("seed_projector", "metric_head.seed_projector") + + if "_net.0" in name: + name = name.replace("_net.0", "conv1") + + if "_net.2" in name: + name = name.replace("_net.2", "conv2") + + if "attractors" in name: + name = name.replace("attractors", "metric_head.attractors") + + if "conditional_log_binomial" in name: + name = name.replace("conditional_log_binomial", "metric_head.conditional_log_binomial") + + # metric depth estimation head + if "conv2" in name and "metric_head" not in name and "attractors" not in name and "relative_head" not in name: + name = name.replace("conv2", "metric_head.conv2") + + if "transformer_encoder.layers" in name: + name = name.replace("transformer_encoder.layers", "transformer_encoder") + + return name + + +def read_in_q_k_v_metric_head(state_dict): + hidden_size = 128 + for i in range(4): + # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"patch_transformer.transformer_encoder.layers.{i}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"patch_transformer.transformer_encoder.layers.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"patch_transformer.transformer_encoder.{i}.self_attn.query.weight"] = in_proj_weight[ + :hidden_size, : + ] + state_dict[f"patch_transformer.transformer_encoder.{i}.self_attn.query.bias"] = in_proj_bias[:hidden_size] + + state_dict[f"patch_transformer.transformer_encoder.{i}.self_attn.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"patch_transformer.transformer_encoder.{i}.self_attn.key.bias"] = in_proj_bias[ + hidden_size : hidden_size * 2 + ] + + state_dict[f"patch_transformer.transformer_encoder.{i}.self_attn.value.weight"] = in_proj_weight[ + -hidden_size:, : + ] + state_dict[f"patch_transformer.transformer_encoder.{i}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] + + +def convert_state_dict(orig_state_dict): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + # rename key + new_name = rename_key(key) + orig_state_dict[new_name] = val + + return orig_state_dict + + +def remove_ignore_keys(state_dict): + for key, _ in state_dict.copy().items(): + if ( + "fc_norm" in key + or "relative_position_index" in key + or "k_idx" in key + or "K_minus_1" in key + or "core.core.pretrained.model.head" in key + ): + state_dict.pop(key, None) + + +# we split up the matrix of each encoder layer 
into queries, keys and values +def read_in_q_k_v(state_dict, config): + hidden_size = config.backbone_config.hidden_size + for i in range(config.backbone_config.num_hidden_layers): + # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"core.core.pretrained.model.blocks.{i}.attn.qkv.weight") + q_bias = state_dict.pop(f"core.core.pretrained.model.blocks.{i}.attn.q_bias") + v_bias = state_dict.pop(f"core.core.pretrained.model.blocks.{i}.attn.v_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = q_bias + state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = v_bias + + +# We will verify our results on an image +def prepare_img(): + filepath = hf_hub_download(repo_id="shariqfarooq/ZoeDepth", filename="examples/person_1.jpeg", repo_type="space") + image = Image.open(filepath).convert("RGB") + return image + + +@torch.no_grad() +def convert_zoedepth_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): + """ + Copy/paste/tweak model's weights to our ZoeDepth structure. + """ + + # define ZoeDepth configuration based on URL + config, _ = get_zoedepth_config(model_name) + + # load original model + original_model = torch.hub.load( + "NielsRogge/ZoeDepth:understanding_zoedepth", model_name, pretrained=True, force_reload=True + ) + original_model.eval() + state_dict = original_model.state_dict() + + print("Original state dict:") + for name, param in state_dict.items(): + print(name, param.shape) + + # read in qkv matrices + read_in_q_k_v(state_dict, config) + if model_name == "ZoeD_NK": + read_in_q_k_v_metric_head(state_dict) + + # rename keys + state_dict = convert_state_dict(state_dict) + # remove certain keys + remove_ignore_keys(state_dict) + + # load HuggingFace model + model = ZoeDepthForDepthEstimation(config) + model.load_state_dict(state_dict) + model.eval() + + # verify image processor + image = prepare_img() + + image_processor = ZoeDepthImageProcessor() + pixel_values = image_processor(image, return_tensors="pt").pixel_values + filepath = hf_hub_download( + repo_id="nielsr/test-image", + filename="zoedepth_pixel_values.pt", + repo_type="dataset", + ) + original_pixel_values = torch.load(filepath, map_location="cpu") + assert torch.allclose(pixel_values, original_pixel_values) + + # verify logits + # this was done on a resized version of the cats image (384x384) + filepath = hf_hub_download( + repo_id="nielsr/test-image", + filename="zoedepth_pixel_values.pt", + repo_type="dataset", + revision="1865dbb81984f01c89e83eec10f8d07efd10743d", + ) + cats_pixel_values = torch.load(filepath, map_location="cpu") + depth = model(cats_pixel_values).predicted_depth + + # Verify logits + # These were obtained by inserting the pixel_values at the patch embeddings of BEiT + if model_name == "ZoeD_N": + expected_shape = torch.Size([1, 384, 384]) + expected_slice = torch.tensor([[1.0328, 1.0604, 1.0747], [1.0816, 1.1293, 1.1456], [1.1117, 1.1629, 1.1766]]) + elif model_name == "ZoeD_K": + expected_shape = torch.Size([1, 384, 384]) + 
expected_slice = torch.tensor([[1.6567, 1.6852, 1.7065], [1.6707, 1.6764, 1.6713], [1.7195, 1.7166, 1.7118]]) + elif model_name == "ZoeD_NK": + expected_shape = torch.Size([1, 384, 384]) + expected_slice = torch.tensor([[1.1228, 1.1079, 1.1382], [1.1807, 1.1658, 1.1891], [1.2344, 1.2094, 1.2317]]) + + print("Shape of depth:", depth.shape) + print("First 3x3 slice of depth:", depth[0, :3, :3]) + + assert depth.shape == torch.Size(expected_shape) + assert torch.allclose(depth[0, :3, :3], expected_slice, atol=1e-4) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + print(f"Saving model and processor to {pytorch_dump_folder_path}") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + image_processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + model_name_to_repo_id = { + "ZoeD_N": "zoedepth-nyu", + "ZoeD_K": "zoedepth-kitti", + "ZoeD_NK": "zoedepth-nyu-kitti", + } + + print("Pushing model and processor to the hub...") + repo_id = model_name_to_repo_id[model_name] + model.push_to_hub(f"Intel/{repo_id}") + image_processor = ZoeDepthImageProcessor() + image_processor.push_to_hub(f"Intel/{repo_id}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="ZoeD_N", + choices=["ZoeD_N", "ZoeD_K", "ZoeD_NK"], + type=str, + help="Name of the original ZoeDepth checkpoint you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=False, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + ) + + args = parser.parse_args() + convert_zoedepth_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py new file mode 100644 index 00000000000000..5276f2239151e8 --- /dev/null +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -0,0 +1,454 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
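Before the image processor implementation that follows, here is a sketch of how the pushed checkpoints are meant to be used end to end. The `Intel/zoedepth-nyu` repo id comes from the conversion script above; the final resize back to the input resolution is an illustrative post-processing choice, not part of this diff.

```python
import requests
import torch
from PIL import Image

from transformers import ZoeDepthForDepthEstimation, ZoeDepthImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu")
model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu")

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    predicted_depth = model(**inputs).predicted_depth  # (batch, height, width), metric depth in meters

# Resize the prediction back to the original image size for visualization.
depth = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1), size=image.size[::-1], mode="bicubic", align_corners=False
).squeeze()
print(depth.shape, float(depth.min()), float(depth.max()))
```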
+"""Image processor class for ZoeDepth.""" + +import math +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import PaddingMode, pad, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_torch_available, is_vision_available, logging, requires_backends + + +if is_vision_available(): + import PIL + +if is_torch_available(): + import torch + from torch import nn + + +logger = logging.get_logger(__name__) + + +def get_resize_output_image_size( + input_image: np.ndarray, + output_size: Union[int, Iterable[int]], + keep_aspect_ratio: bool, + multiple: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + def constrain_to_multiple_of(val, multiple, min_val=0): + x = (np.round(val / multiple) * multiple).astype(int) + + if x < min_val: + x = math.ceil(val / multiple) * multiple + + return x + + output_size = (output_size, output_size) if isinstance(output_size, int) else output_size + + input_height, input_width = get_image_size(input_image, input_data_format) + output_height, output_width = output_size + + # determine new height and width + scale_height = output_height / input_height + scale_width = output_width / input_width + + if keep_aspect_ratio: + # scale as little as possible + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + + new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple) + new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple) + + return (new_height, new_width) + + +class ZoeDepthImageProcessor(BaseImageProcessor): + r""" + Constructs a ZoeDepth image processor. + + Args: + do_pad (`bool`, *optional*, defaults to `True`): + Whether to apply pad the input. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overidden by `do_rescale` in + `preprocess`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overidden by `rescale_factor` in `preprocess`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions. Can be overidden by `do_resize` in `preprocess`. 
+ size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 512}`): + Size of the image after resizing. Size of the image after resizing. If `keep_aspect_ratio` is `True`, + the image is resized by choosing the smaller of the height and width scaling factors and using it for both dimensions. + If `ensure_multiple_of` is also set, the image is further resized to a size that is a multiple of this value. + Can be overidden by `size` in `preprocess`. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`. + keep_aspect_ratio (`bool`, *optional*, defaults to `True`): + If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it for + both dimensions. This ensures that the image is scaled down as little as possible while still fitting within the + desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a size that is a + multiple of this value by flooring the height and width to the nearest multiple of this value. + Can be overidden by `keep_aspect_ratio` in `preprocess`. + ensure_multiple_of (`int`, *optional*, defaults to 32): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring + the height and width to the nearest multiple of this value. + + Works both with and without `keep_aspect_ratio` being set to `True`. Can be overidden by `ensure_multiple_of` + in `preprocess`. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_pad: bool = True, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + keep_aspect_ratio: bool = True, + ensure_multiple_of: int = 32, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_pad = do_pad + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + size = size if size is not None else {"height": 384, "width": 512} + size = get_size_dict(size) + self.do_resize = do_resize + self.size = size + self.keep_aspect_ratio = keep_aspect_ratio + self.ensure_multiple_of = ensure_multiple_of + self.resample = resample + + self._valid_processor_keys = [ + "images", + "do_resize", + "size", + "keep_aspect_ratio", + "ensure_multiple_of", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_pad", + "return_tensors", + "data_format", + "input_data_format", + ] + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + keep_aspect_ratio: bool = False, + ensure_multiple_of: int = 1, + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Resize an image to target size `(size["height"], size["width"])`. If `keep_aspect_ratio` is `True`, the image + is resized to the largest possible size such that the aspect ratio is preserved. 
If `ensure_multiple_of` is + set, the image is resized to a size that is a multiple of this value. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Target size of the output image. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. + ensure_multiple_of (`int`, *optional*, defaults to 1): + The image is resized to a size that is a multiple of this value. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Defines the resampling filter to use if resizing the image. Otherwise, the image is resized to size + specified in `size`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + data_format = data_format if data_format is not None else input_data_format + + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}") + + output_size = get_resize_output_image_size( + image, + output_size=(size["height"], size["width"]), + keep_aspect_ratio=keep_aspect_ratio, + multiple=ensure_multiple_of, + input_data_format=input_data_format, + ) + + height, width = output_size + + torch_image = torch.from_numpy(image).unsqueeze(0) + torch_image = torch_image.permute(0, 3, 1, 2) if input_data_format == "channels_last" else torch_image + + # TODO support align_corners=True in image_transforms.resize + requires_backends(self, "torch") + resample_to_mode = {PILImageResampling.BILINEAR: "bilinear", PILImageResampling.BICUBIC: "bicubic"} + mode = resample_to_mode[resample] + resized_image = nn.functional.interpolate( + torch_image, (int(height), int(width)), mode=mode, align_corners=True + ) + resized_image = resized_image.squeeze().numpy() + + resized_image = to_channel_dimension_format( + resized_image, data_format, input_channel_dim=ChannelDimension.FIRST + ) + + return resized_image + + def pad_image( + self, + image: np.array, + mode: PaddingMode = PaddingMode.REFLECT, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Pad an image as done in the original ZoeDepth implementation. + + Padding fixes the boundary artifacts in the output depth map. + Boundary artifacts are sometimes caused by the fact that the model is trained on NYU raw dataset + which has a black or white border around the image. This function pads the input image and crops + the prediction back to the original size / view. + + Args: + image (`np.ndarray`): + Image to pad. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. 
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + height, width = get_image_size(image, input_data_format) + + pad_height = int(np.sqrt(height / 2) * 3) + pad_width = int(np.sqrt(width / 2) * 3) + + return pad( + image, + padding=((pad_height, pad_height), (pad_width, pad_width)), + mode=mode, + data_format=data_format, + input_data_format=input_data_format, + ) + + def preprocess( + self, + images: ImageInput, + do_pad: bool = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_resize: bool = None, + size: int = None, + keep_aspect_ratio: bool = None, + ensure_multiple_of: int = None, + resample: PILImageResampling = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the input image. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. If `keep_aspect_ratio` is `True`, he image is resized by choosing the smaller of + the height and width scaling factors and using it for both dimensions. If `ensure_multiple_of` is also set, + the image is further resized to a size that is a multiple of this value. + keep_aspect_ratio (`bool`, *optional*, defaults to `self.keep_aspect_ratio`): + If `True` and `do_resize=True`, the image is resized by choosing the smaller of the height and width scaling factors and using it for + both dimensions. 
This ensures that the image is scaled down as little as possible while still fitting within the + desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a size that is a + multiple of this value by flooring the height and width to the nearest multiple of this value. + ensure_multiple_of (`int`, *optional*, defaults to `self.ensure_multiple_of`): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring + the height and width to the nearest multiple of this value. + + Works both with and without `keep_aspect_ratio` being set to `True`. Can be overidden by `ensure_multiple_of` in `preprocess`. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size) + keep_aspect_ratio = keep_aspect_ratio if keep_aspect_ratio is not None else self.keep_aspect_ratio + ensure_multiple_of = ensure_multiple_of if ensure_multiple_of is not None else self.ensure_multiple_of + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_pad = do_pad if do_pad is not None else self.do_pad + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + # All transformations expect numpy arrays. 
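+        # The transformations below are applied in this order: optional rescaling, then reflection padding (see
+        # `pad_image`, which mirrors the original ZoeDepth implementation), then resizing, then normalization.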
+ images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_pad: + images = [self.pad_image(image=image, input_data_format=input_data_format) for image in images] + + if do_resize: + images = [ + self.resize( + image=image, + size=size, + resample=resample, + keep_aspect_ratio=keep_aspect_ratio, + ensure_multiple_of=ensure_multiple_of, + input_data_format=input_data_format, + ) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/zoedepth/modeling_zoedepth.py b/src/transformers/models/zoedepth/modeling_zoedepth.py new file mode 100644 index 00000000000000..f03f775d1e4faf --- /dev/null +++ b/src/transformers/models/zoedepth/modeling_zoedepth.py @@ -0,0 +1,1403 @@ +# coding=utf-8 +# Copyright 2024 Intel Labs and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ZoeDepth model.""" + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...file_utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, logging +from ...utils.backbone_utils import load_backbone +from .configuration_zoedepth import ZoeDepthConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "ZoeDepthConfig" + + +@dataclass +class ZoeDepthDepthEstimatorOutput(ModelOutput): + """ + Extension of `DepthEstimatorOutput` to include domain logits (ZoeDepth specific). + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + + domain_logits (`torch.FloatTensor` of shape `(batch_size, num_domains)`): + Logits for each domain (e.g. 
NYU and KITTI) in case multiple metric heads are used. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + predicted_depth: torch.FloatTensor = None + domain_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +class ZoeDepthReassembleStage(nn.Module): + """ + This class reassembles the hidden states of the backbone into image-like feature representations at various + resolutions. + + This happens in 3 stages: + 1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to + `config.readout_type`. + 2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`. + 3. Resizing the spatial dimensions (height, width). + + Args: + config (`[ZoeDepthConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.readout_type = config.readout_type + self.layers = nn.ModuleList() + + for neck_hidden_size, factor in zip(config.neck_hidden_sizes, config.reassemble_factors): + self.layers.append(ZoeDepthReassembleLayer(config, channels=neck_hidden_size, factor=factor)) + + if config.readout_type == "project": + self.readout_projects = nn.ModuleList() + hidden_size = config.backbone_hidden_size + for _ in config.neck_hidden_sizes: + self.readout_projects.append( + nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act]) + ) + + def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> List[torch.Tensor]: + """ + Args: + hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): + List of hidden states from the backbone. 
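+            patch_height (`int`):
+                Number of patches along the image height, i.e. the input height divided by the backbone's patch size.
+            patch_width (`int`):
+                Number of patches along the image width, i.e. the input width divided by the backbone's patch size.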
+ """ + batch_size = hidden_states[0].shape[0] + + # stack along batch dimension + # shape (batch_size*num_stages, sequence_length + 1, hidden_size) + hidden_states = torch.cat(hidden_states, dim=0) + + cls_token, hidden_states = hidden_states[:, 0], hidden_states[:, 1:] + # reshape hidden_states to (batch_size*num_stages, num_channels, height, width) + total_batch_size, sequence_length, num_channels = hidden_states.shape + hidden_states = hidden_states.reshape(total_batch_size, patch_height, patch_width, num_channels) + hidden_states = hidden_states.permute(0, 3, 1, 2).contiguous() + + if self.readout_type == "project": + # reshape to (batch_size*num_stages, height*width, num_channels) + hidden_states = hidden_states.flatten(2).permute((0, 2, 1)) + readout = cls_token.unsqueeze(dim=1).expand_as(hidden_states) + # concatenate the readout token to the hidden states + # to get (batch_size*num_stages, height*width, 2*num_channels) + hidden_states = torch.cat((hidden_states, readout), -1) + elif self.readout_type == "add": + hidden_states = hidden_states + cls_token.unsqueeze(-1) + + out = [] + for stage_idx, hidden_state in enumerate(hidden_states.split(batch_size, dim=0)): + if self.readout_type == "project": + hidden_state = self.readout_projects[stage_idx](hidden_state) + + # reshape back to (batch_size, num_channels, height, width) + hidden_state = hidden_state.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width) + hidden_state = self.layers[stage_idx](hidden_state) + out.append(hidden_state) + + return out + + +class ZoeDepthReassembleLayer(nn.Module): + def __init__(self, config, channels, factor): + super().__init__() + # projection + hidden_size = config.backbone_hidden_size + self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1) + + # up/down sampling depending on factor + if factor > 1: + self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0) + elif factor == 1: + self.resize = nn.Identity() + elif factor < 1: + # so should downsample + self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1) + + # Copied from transformers.models.dpt.modeling_dpt.DPTReassembleLayer.forward with DPT->ZoeDepth + def forward(self, hidden_state): + hidden_state = self.projection(hidden_state) + hidden_state = self.resize(hidden_state) + return hidden_state + + +# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->ZoeDepth +class ZoeDepthFeatureFusionStage(nn.Module): + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList() + for _ in range(len(config.neck_hidden_sizes)): + self.layers.append(ZoeDepthFeatureFusionLayer(config)) + + def forward(self, hidden_states): + # reversing the hidden_states, we start from the last + hidden_states = hidden_states[::-1] + + fused_hidden_states = [] + # first layer only uses the last hidden_state + fused_hidden_state = self.layers[0](hidden_states[0]) + fused_hidden_states.append(fused_hidden_state) + # looping from the last layer to the second + for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer with DPT->ZoeDepth +class ZoeDepthPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. 
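+    Each of the two 3x3 convolutions is preceded by a ReLU activation and optionally followed by batch normalization,
+    and the input of the block is added back to its output through a residual connection.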
+ + Args: + config (`[ZoeDepthConfig]`): + Model configuration class defining the model architecture. + """ + + # Ignore copy + def __init__(self, config): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size, eps=config.batch_norm_eps) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size, eps=config.batch_norm_eps) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer with DPT->ZoeDepth +class ZoeDepthFeatureFusionLayer(nn.Module): + """Feature fusion layer, merges feature maps from different stages. + + Args: + config (`[ZoeDepthConfig]`): + Model configuration class defining the model architecture. + align_corners (`bool`, *optional*, defaults to `True`): + The align_corner setting for bilinear upsample. + """ + + def __init__(self, config, align_corners=True): + super().__init__() + + self.align_corners = align_corners + + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) + + self.residual_layer1 = ZoeDepthPreActResidualLayer(config) + self.residual_layer2 = ZoeDepthPreActResidualLayer(config) + + def forward(self, hidden_state, residual=None): + if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) + hidden_state = hidden_state + self.residual_layer1(residual) + + hidden_state = self.residual_layer2(hidden_state) + hidden_state = nn.functional.interpolate( + hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners + ) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +class ZoeDepthNeck(nn.Module): + """ + ZoeDepthNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as + input and produces another list of tensors as output. For ZoeDepth, it includes 2 stages: + + * ZoeDepthReassembleStage + * ZoeDepthFeatureFusionStage. + + Args: + config (dict): config dict. + """ + + # Copied from transformers.models.dpt.modeling_dpt.DPTNeck.__init__ with DPT->ZoeDepth + def __init__(self, config): + super().__init__() + self.config = config + + # postprocessing: only required in case of a non-hierarchical backbone (e.g. 
ViT, BEiT) + if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]: + self.reassemble_stage = None + else: + self.reassemble_stage = ZoeDepthReassembleStage(config) + + self.convs = nn.ModuleList() + for channel in config.neck_hidden_sizes: + self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)) + + # fusion + self.fusion_stage = ZoeDepthFeatureFusionStage(config) + + def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> List[torch.Tensor]: + """ + Args: + hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`): + List of hidden states from the backbone. + """ + if not isinstance(hidden_states, (tuple, list)): + raise ValueError("hidden_states should be a tuple or list of tensors") + + if len(hidden_states) != len(self.config.neck_hidden_sizes): + raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") + + # postprocess hidden states + if self.reassemble_stage is not None: + hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width) + + features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)] + + # fusion blocks + output = self.fusion_stage(features) + + return output, features[-1] + + +class ZoeDepthRelativeDepthEstimationHead(nn.Module): + """ + Relative depth estimation head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in DPT's paper's + supplementary material). + """ + + def __init__(self, config): + super().__init__() + + self.head_in_index = config.head_in_index + + self.projection = None + if config.add_projection: + self.projection = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + + features = config.fusion_hidden_size + self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1) + self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True) + self.conv2 = nn.Conv2d(features // 2, config.num_relative_features, kernel_size=3, stride=1, padding=1) + self.conv3 = nn.Conv2d(config.num_relative_features, 1, kernel_size=1, stride=1, padding=0) + + def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: + # use last features + hidden_states = hidden_states[self.head_in_index] + + if self.projection is not None: + hidden_states = self.projection(hidden_states) + hidden_states = nn.ReLU()(hidden_states) + + hidden_states = self.conv1(hidden_states) + hidden_states = self.upsample(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = nn.ReLU()(hidden_states) + # we need the features here (after second conv + ReLu) + features = hidden_states + hidden_states = self.conv3(hidden_states) + hidden_states = nn.ReLU()(hidden_states) + + predicted_depth = hidden_states.squeeze(dim=1) + + return predicted_depth, features + + +def log_binom(n, k, eps=1e-7): + """log(nCk) using stirling approximation""" + n = n + eps + k = k + eps + return n * torch.log(n) - k * torch.log(k) - (n - k) * torch.log(n - k + eps) + + +class LogBinomialSoftmax(nn.Module): + def __init__(self, n_classes=256, act=torch.softmax): + """Compute log binomial distribution for n_classes + + Args: + n_classes (`int`, *optional*, defaults to 256): + Number of output classes. 
+ act (`torch.nn.Module`, *optional*, defaults to `torch.softmax`): + Activation function to apply to the output. + """ + super().__init__() + self.k = n_classes + self.act = act + self.register_buffer("k_idx", torch.arange(0, n_classes).view(1, -1, 1, 1), persistent=False) + self.register_buffer("k_minus_1", torch.Tensor([self.k - 1]).view(1, -1, 1, 1), persistent=False) + + def forward(self, probabilities, temperature=1.0, eps=1e-4): + """Compute the log binomial distribution for probabilities. + + Args: + probabilities (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): + Tensor containing probabilities of each class. + temperature (`float` or `torch.Tensor` of shape `(batch_size, num_channels, height, width)`, *optional*, defaults to 1): + Temperature of distribution. + eps (`float`, *optional*, defaults to 1e-4): + Small number for numerical stability. + + Returns: + `torch.Tensor` of shape `(batch_size, num_channels, height, width)`: + Log binomial distribution logbinomial(p;t). + """ + if probabilities.ndim == 3: + probabilities = probabilities.unsqueeze(1) # make it (batch_size, num_channels, height, width) + + one_minus_probabilities = torch.clamp(1 - probabilities, eps, 1) + probabilities = torch.clamp(probabilities, eps, 1) + y = ( + log_binom(self.k_minus_1, self.k_idx) + + self.k_idx * torch.log(probabilities) + + (self.k_minus_1 - self.k_idx) * torch.log(one_minus_probabilities) + ) + return self.act(y / temperature, dim=1) + + +class ZoeDepthConditionalLogBinomialSoftmax(nn.Module): + def __init__( + self, + config, + in_features, + condition_dim, + n_classes=256, + bottleneck_factor=2, + ): + """Per-pixel MLP followed by a Conditional Log Binomial softmax. + + Args: + in_features (`int`): + Number of input channels in the main feature. + condition_dim (`int`): + Number of input channels in the condition feature. + n_classes (`int`, *optional*, defaults to 256): + Number of classes. + bottleneck_factor (`int`, *optional*, defaults to 2): + Hidden dim factor. + + """ + super().__init__() + + bottleneck = (in_features + condition_dim) // bottleneck_factor + self.mlp = nn.Sequential( + nn.Conv2d(in_features + condition_dim, bottleneck, kernel_size=1, stride=1, padding=0), + nn.GELU(), + # 2 for probabilities linear norm, 2 for temperature linear norm + nn.Conv2d(bottleneck, 2 + 2, kernel_size=1, stride=1, padding=0), + nn.Softplus(), + ) + + self.p_eps = 1e-4 + self.max_temp = config.max_temp + self.min_temp = config.min_temp + self.log_binomial_transform = LogBinomialSoftmax(n_classes, act=torch.softmax) + + def forward(self, main_feature, condition_feature): + """ + Args: + main_feature (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): + Main feature. + condition_feature (torch.Tensor of shape `(batch_size, num_channels, height, width)`): + Condition feature. + + Returns: + `torch.Tensor`: + Output log binomial distribution + """ + probabilities_and_temperature = self.mlp(torch.concat((main_feature, condition_feature), dim=1)) + probabilities, temperature = ( + probabilities_and_temperature[:, :2, ...], + probabilities_and_temperature[:, 2:, ...], + ) + + probabilities = probabilities + self.p_eps + probabilities = probabilities[:, 0, ...] / (probabilities[:, 0, ...] + probabilities[:, 1, ...]) + + temperature = temperature + self.p_eps + temperature = temperature[:, 0, ...] / (temperature[:, 0, ...] 
+ temperature[:, 1, ...]) + temperature = temperature.unsqueeze(1) + temperature = (self.max_temp - self.min_temp) * temperature + self.min_temp + + return self.log_binomial_transform(probabilities, temperature) + + +class ZoeDepthSeedBinRegressor(nn.Module): + def __init__(self, config, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10): + """Bin center regressor network. + + Can be "normed" or "unnormed". If "normed", bin centers are bounded on the (min_depth, max_depth) interval. + + Args: + config (`int`): + Model configuration. + n_bins (`int`, *optional*, defaults to 16): + Number of bin centers. + mlp_dim (`int`, *optional*, defaults to 256): + Hidden dimension. + min_depth (`float`, *optional*, defaults to 1e-3): + Min depth value. + max_depth (`float`, *optional*, defaults to 10): + Max depth value. + """ + super().__init__() + + self.in_features = config.bottleneck_features + self.bin_centers_type = config.bin_centers_type + self.min_depth = min_depth + self.max_depth = max_depth + + self.conv1 = nn.Conv2d(self.in_features, mlp_dim, 1, 1, 0) + self.act1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(mlp_dim, n_bins, 1, 1, 0) + self.act2 = nn.ReLU(inplace=True) if self.bin_centers_type == "normed" else nn.Softplus() + + def forward(self, x): + """ + Returns tensor of bin_width vectors (centers). One vector b for every pixel + """ + x = self.conv1(x) + x = self.act1(x) + x = self.conv2(x) + bin_centers = self.act2(x) + + if self.bin_centers_type == "normed": + bin_centers = bin_centers + 1e-3 + bin_widths_normed = bin_centers / bin_centers.sum(dim=1, keepdim=True) + # shape (batch_size, num_channels, height, width) + bin_widths = (self.max_depth - self.min_depth) * bin_widths_normed + # pad has the form (left, right, top, bottom, front, back) + bin_widths = nn.functional.pad(bin_widths, (0, 0, 0, 0, 1, 0), mode="constant", value=self.min_depth) + # shape (batch_size, num_channels, height, width) + bin_edges = torch.cumsum(bin_widths, dim=1) + + bin_centers = 0.5 * (bin_edges[:, :-1, ...] + bin_edges[:, 1:, ...]) + return bin_widths_normed, bin_centers + + else: + return bin_centers, bin_centers + + +@torch.jit.script +def inv_attractor(dx, alpha: float = 300, gamma: int = 2): + """Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center + This is the default one according to the accompanying paper. + + Args: + dx (`torch.Tensor`): + The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center. + alpha (`float`, *optional*, defaults to 300): + Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. + gamma (`int`, *optional*, defaults to 2): + Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. + Lower gamma = farther reach. + + Returns: + torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc + """ + return dx.div(1 + alpha * dx.pow(gamma)) + + +class ZoeDepthAttractorLayer(nn.Module): + def __init__( + self, + config, + n_bins, + n_attractors=16, + min_depth=1e-3, + max_depth=10, + memory_efficient=False, + ): + """ + Attractor layer for bin centers. 
Bin centers are bounded on the interval (min_depth, max_depth) + """ + super().__init__() + + self.alpha = config.attractor_alpha + self.gemma = config.attractor_gamma + self.kind = config.attractor_kind + + self.n_attractors = n_attractors + self.n_bins = n_bins + self.min_depth = min_depth + self.max_depth = max_depth + self.memory_efficient = memory_efficient + + # MLP to predict attractor points + in_features = mlp_dim = config.bin_embedding_dim + self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0) + self.act1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(mlp_dim, n_attractors * 2, 1, 1, 0) # x2 for linear norm + self.act2 = nn.ReLU(inplace=True) + + def forward(self, x, prev_bin, prev_bin_embedding=None, interpolate=True): + """ + The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers + and the attractor points (the latter are predicted by the MLP). + + Args: + x (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): + Feature block. + prev_bin (`torch.Tensor` of shape `(batch_size, prev_number_of_bins, height, width)`): + Previous bin centers normed. + prev_bin_embedding (`torch.Tensor`, *optional*): + Optional previous bin embeddings. + interpolate (`bool`, *optional*, defaults to `True`): + Whether to interpolate the previous bin embeddings to the size of the input features. + + Returns: + `Tuple[`torch.Tensor`, `torch.Tensor`]: + New bin centers normed and scaled. + """ + if prev_bin_embedding is not None: + if interpolate: + prev_bin_embedding = nn.functional.interpolate( + prev_bin_embedding, x.shape[-2:], mode="bilinear", align_corners=True + ) + x = x + prev_bin_embedding + + x = self.conv1(x) + x = self.act1(x) + x = self.conv2(x) + attractors = self.act2(x) + + attractors = attractors + 1e-3 + batch_size, _, height, width = attractors.shape + attractors = attractors.view(batch_size, self.n_attractors, 2, height, width) + # batch_size, num_attractors, 2, height, width + # note: original repo had a bug here: https://github.com/isl-org/ZoeDepth/blame/edb6daf45458569e24f50250ef1ed08c015f17a7/zoedepth/models/layers/attractor.py#L105C9-L106C50 + # we include the bug to maintain compatibility with the weights + attractors_normed = attractors[:, :, 0, ...] 
# batch_size, batch_size*num_attractors, height, width + + bin_centers = nn.functional.interpolate(prev_bin, (height, width), mode="bilinear", align_corners=True) + + # note: only attractor_type = "exp" is supported here, since no checkpoints were released with other attractor types + + if not self.memory_efficient: + func = {"mean": torch.mean, "sum": torch.sum}[self.kind] + # shape (batch_size, num_bins, height, width) + delta_c = func(inv_attractor(attractors_normed.unsqueeze(2) - bin_centers.unsqueeze(1)), dim=1) + else: + delta_c = torch.zeros_like(bin_centers, device=bin_centers.device) + for i in range(self.n_attractors): + # shape (batch_size, num_bins, height, width) + delta_c += inv_attractor(attractors_normed[:, i, ...].unsqueeze(1) - bin_centers) + + if self.kind == "mean": + delta_c = delta_c / self.n_attractors + + bin_new_centers = bin_centers + delta_c + bin_centers = (self.max_depth - self.min_depth) * bin_new_centers + self.min_depth + bin_centers, _ = torch.sort(bin_centers, dim=1) + bin_centers = torch.clip(bin_centers, self.min_depth, self.max_depth) + return bin_new_centers, bin_centers + + +class ZoeDepthAttractorLayerUnnormed(nn.Module): + def __init__( + self, + config, + n_bins, + n_attractors=16, + min_depth=1e-3, + max_depth=10, + memory_efficient=True, + ): + """ + Attractor layer for bin centers. Bin centers are unbounded + """ + super().__init__() + + self.n_attractors = n_attractors + self.n_bins = n_bins + self.min_depth = min_depth + self.max_depth = max_depth + self.alpha = config.attractor_alpha + self.gamma = config.attractor_alpha + self.kind = config.attractor_kind + self.memory_efficient = memory_efficient + + in_features = mlp_dim = config.bin_embedding_dim + self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0) + self.act1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(mlp_dim, n_attractors, 1, 1, 0) + self.act2 = nn.Softplus() + + def forward(self, x, prev_bin, prev_bin_embedding=None, interpolate=True): + """ + The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers + and the attractor points (the latter are predicted by the MLP). + + Args: + x (`torch.Tensor` of shape (batch_size, num_channels, height, width)`): + Feature block. + prev_bin (`torch.Tensor` of shape (batch_size, prev_num_bins, height, width)`): + Previous bin centers normed. + prev_bin_embedding (`torch.Tensor`, *optional*): + Optional previous bin embeddings. + interpolate (`bool`, *optional*, defaults to `True`): + Whether to interpolate the previous bin embeddings to the size of the input features. + + Returns: + `Tuple[`torch.Tensor`, `torch.Tensor`]: + New bin centers unbounded. Two outputs just to keep the API consistent with the normed version. 
+ """ + if prev_bin_embedding is not None: + if interpolate: + prev_bin_embedding = nn.functional.interpolate( + prev_bin_embedding, x.shape[-2:], mode="bilinear", align_corners=True + ) + x = x + prev_bin_embedding + + x = self.conv1(x) + x = self.act1(x) + x = self.conv2(x) + attractors = self.act2(x) + + height, width = attractors.shape[-2:] + + bin_centers = nn.functional.interpolate(prev_bin, (height, width), mode="bilinear", align_corners=True) + + if not self.memory_efficient: + func = {"mean": torch.mean, "sum": torch.sum}[self.kind] + # shape batch_size, num_bins, height, width + delta_c = func(inv_attractor(attractors.unsqueeze(2) - bin_centers.unsqueeze(1)), dim=1) + else: + delta_c = torch.zeros_like(bin_centers, device=bin_centers.device) + for i in range(self.n_attractors): + # shape batch_size, num_bins, height, width + delta_c += inv_attractor(attractors[:, i, ...].unsqueeze(1) - bin_centers) + + if self.kind == "mean": + delta_c = delta_c / self.n_attractors + + bin_new_centers = bin_centers + delta_c + bin_centers = bin_new_centers + + return bin_new_centers, bin_centers + + +class ZoeDepthProjector(nn.Module): + def __init__(self, in_features, out_features, mlp_dim=128): + """Projector MLP. + + Args: + in_features (`int`): + Number of input channels. + out_features (`int`): + Number of output channels. + mlp_dim (`int`, *optional*, defaults to 128): + Hidden dimension. + """ + super().__init__() + + self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0) + self.act = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(mlp_dim, out_features, 1, 1, 0) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.conv1(hidden_state) + hidden_state = self.act(hidden_state) + hidden_state = self.conv2(hidden_state) + + return hidden_state + + +# Copied from transformers.models.grounding_dino.modeling_grounding_dino.GroundingDinoMultiheadAttention with GroundingDino->ZoeDepth +class ZoeDepthMultiheadAttention(nn.Module): + """Equivalent implementation of nn.MultiheadAttention with `batch_first=True`.""" + + # Ignore copy + def __init__(self, hidden_size, num_attention_heads, dropout): + super().__init__() + if hidden_size % num_attention_heads != 0: + raise ValueError( + f"The hidden size ({hidden_size}) is not a multiple of the number of attention " + f"heads ({num_attention_heads})" + ) + + self.num_attention_heads = num_attention_heads + self.attention_head_size = int(hidden_size / num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(hidden_size, self.all_head_size) + self.key = nn.Linear(hidden_size, self.all_head_size) + self.value = nn.Linear(hidden_size, self.all_head_size) + + self.out_proj = nn.Linear(hidden_size, hidden_size) + + self.dropout = nn.Dropout(dropout) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + queries: torch.Tensor, + keys: torch.Tensor, + values: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + query_layer = self.transpose_for_scores(self.query(queries)) + key_layer = self.transpose_for_scores(self.key(keys)) + value_layer = self.transpose_for_scores(self.value(values)) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
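+        # query_layer, key_layer and value_layer have shape
+        # (batch_size, num_attention_heads, seq_length, attention_head_size), so the attention scores below have
+        # shape (batch_size, num_attention_heads, seq_length, seq_length).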
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ZoeDepthModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + context_layer = self.out_proj(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class ZoeDepthTransformerEncoderLayer(nn.Module): + def __init__(self, config, dropout=0.1, activation="relu"): + super().__init__() + + hidden_size = config.patch_transformer_hidden_size + intermediate_size = config.patch_transformer_intermediate_size + num_attention_heads = config.patch_transformer_num_attention_heads + + self.self_attn = ZoeDepthMultiheadAttention(hidden_size, num_attention_heads, dropout=dropout) + + self.linear1 = nn.Linear(hidden_size, intermediate_size) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(intermediate_size, hidden_size) + + self.norm1 = nn.LayerNorm(hidden_size) + self.norm2 = nn.LayerNorm(hidden_size) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = ACT2FN[activation] + + def forward( + self, + src, + src_mask: Optional[torch.Tensor] = None, + ): + queries = keys = src + src2 = self.self_attn(queries=queries, keys=keys, values=src, attention_mask=src_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + +class ZoeDepthPatchTransformerEncoder(nn.Module): + def __init__(self, config): + """ViT-like transformer block + + Args: + config (`ZoeDepthConfig`): + Model configuration class defining the model architecture. + """ + super().__init__() + + in_channels = config.bottleneck_features + + self.transformer_encoder = nn.ModuleList( + [ZoeDepthTransformerEncoderLayer(config) for _ in range(config.num_patch_transformer_layers)] + ) + + self.embedding_convPxP = nn.Conv2d( + in_channels, config.patch_transformer_hidden_size, kernel_size=1, stride=1, padding=0 + ) + + def positional_encoding_1d(self, batch_size, sequence_length, embedding_dim, device="cpu", dtype=torch.float32): + """Generate positional encodings + + Args: + sequence_length (int): Sequence length + embedding_dim (int): Embedding dimension + + Returns: + torch.Tensor: Positional encodings. 
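+            The returned tensor has shape `(batch_size, sequence_length, embedding_dim)`.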
+ """ + position = torch.arange(0, sequence_length, dtype=dtype, device=device).unsqueeze(1) + index = torch.arange(0, embedding_dim, 2, dtype=dtype, device=device).unsqueeze(0) + div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim)) + pos_encoding = position * div_term + pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1) + pos_encoding = pos_encoding.unsqueeze(dim=0).repeat(batch_size, 1, 1) + return pos_encoding + + def forward(self, x): + """Forward pass + + Args: + x (torch.Tensor - NCHW): Input feature tensor + + Returns: + torch.Tensor - Transformer output embeddings of shape (batch_size, sequence_length, embedding_dim) + """ + embeddings = self.embedding_convPxP(x).flatten(2) # shape (batch_size, num_channels, sequence_length) + # add an extra special CLS token at the start for global accumulation + embeddings = nn.functional.pad(embeddings, (1, 0)) + + embeddings = embeddings.permute(0, 2, 1) + batch_size, sequence_length, embedding_dim = embeddings.shape + embeddings = embeddings + self.positional_encoding_1d( + batch_size, sequence_length, embedding_dim, device=embeddings.device, dtype=embeddings.dtype + ) + + for i in range(4): + embeddings = self.transformer_encoder[i](embeddings) + + return embeddings + + +class ZoeDepthMLPClassifier(nn.Module): + def __init__(self, in_features, out_features) -> None: + super().__init__() + + hidden_features = in_features + self.linear1 = nn.Linear(in_features, hidden_features) + self.activation = nn.ReLU() + self.linear2 = nn.Linear(hidden_features, out_features) + + def forward(self, hidden_state): + hidden_state = self.linear1(hidden_state) + hidden_state = self.activation(hidden_state) + domain_logits = self.linear2(hidden_state) + + return domain_logits + + +class ZoeDepthMultipleMetricDepthEstimationHeads(nn.Module): + """ + Multiple metric depth estimation heads. A MLP classifier is used to route between 2 different heads. 
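+    The classifier operates on a global embedding produced by the patch transformer encoder on top of the bottleneck
+    features; the domain logits are summed over the batch and softmaxed, and the highest-scoring bin configuration
+    (e.g. NYU or KITTI) determines which metric head is applied.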
+ """ + + def __init__(self, config): + super().__init__() + + bin_embedding_dim = config.bin_embedding_dim + n_attractors = config.num_attractors + self.bin_configurations = config.bin_configurations + self.bin_centers_type = config.bin_centers_type + + # Bottleneck convolution + bottleneck_features = config.bottleneck_features + self.conv2 = nn.Conv2d(bottleneck_features, bottleneck_features, kernel_size=1, stride=1, padding=0) + + # Transformer classifier on the bottleneck + self.patch_transformer = ZoeDepthPatchTransformerEncoder(config) + # MLP classifier + self.mlp_classifier = ZoeDepthMLPClassifier(in_features=128, out_features=2) + + # Regressor and attractor + if self.bin_centers_type == "normed": + Attractor = ZoeDepthAttractorLayer + elif self.bin_centers_type == "softplus": + Attractor = ZoeDepthAttractorLayerUnnormed + # We have bins for each bin configuration + # Create a map (ModuleDict) of 'name' -> seed_bin_regressor + self.seed_bin_regressors = nn.ModuleDict( + { + conf["name"]: ZoeDepthSeedBinRegressor( + config, + n_bins=conf["n_bins"], + mlp_dim=bin_embedding_dim // 2, + min_depth=conf["min_depth"], + max_depth=conf["max_depth"], + ) + for conf in config.bin_configurations + } + ) + + self.seed_projector = ZoeDepthProjector( + in_features=bottleneck_features, out_features=bin_embedding_dim, mlp_dim=bin_embedding_dim // 2 + ) + self.projectors = nn.ModuleList( + [ + ZoeDepthProjector( + in_features=config.fusion_hidden_size, + out_features=bin_embedding_dim, + mlp_dim=bin_embedding_dim // 2, + ) + for _ in range(4) + ] + ) + + # Create a map (ModuleDict) of 'name' -> attractors (ModuleList) + self.attractors = nn.ModuleDict( + { + configuration["name"]: nn.ModuleList( + [ + Attractor( + config, + n_bins=n_attractors[i], + min_depth=configuration["min_depth"], + max_depth=configuration["max_depth"], + ) + for i in range(len(n_attractors)) + ] + ) + for configuration in config.bin_configurations + } + ) + + last_in = config.num_relative_features + # conditional log binomial for each bin configuration + self.conditional_log_binomial = nn.ModuleDict( + { + configuration["name"]: ZoeDepthConditionalLogBinomialSoftmax( + config, + last_in, + bin_embedding_dim, + configuration["n_bins"], + bottleneck_factor=4, + ) + for configuration in config.bin_configurations + } + ) + + def forward(self, outconv_activation, bottleneck, feature_blocks, relative_depth): + x = self.conv2(bottleneck) + + # Predict which path to take + # Embedding is of shape (batch_size, hidden_size) + embedding = self.patch_transformer(x)[:, 0, :] + + # MLP classifier to get logits of shape (batch_size, 2) + domain_logits = self.mlp_classifier(embedding) + domain_vote = torch.softmax(domain_logits.sum(dim=0, keepdim=True), dim=-1) + + # Get the path + names = [configuration["name"] for configuration in self.bin_configurations] + bin_configurations_name = names[torch.argmax(domain_vote, dim=-1).squeeze().item()] + + try: + conf = [config for config in self.bin_configurations if config["name"] == bin_configurations_name][0] + except IndexError: + raise ValueError(f"bin_configurations_name {bin_configurations_name} not found in bin_configurationss") + + min_depth = conf["min_depth"] + max_depth = conf["max_depth"] + + seed_bin_regressor = self.seed_bin_regressors[bin_configurations_name] + _, seed_bin_centers = seed_bin_regressor(x) + if self.bin_centers_type in ["normed", "hybrid2"]: + prev_bin = (seed_bin_centers - min_depth) / (max_depth - min_depth) + else: + prev_bin = seed_bin_centers + prev_bin_embedding 
= self.seed_projector(x) + + attractors = self.attractors[bin_configurations_name] + for projector, attractor, feature in zip(self.projectors, attractors, feature_blocks): + bin_embedding = projector(feature) + bin, bin_centers = attractor(bin_embedding, prev_bin, prev_bin_embedding, interpolate=True) + prev_bin = bin + prev_bin_embedding = bin_embedding + + last = outconv_activation + + bin_centers = nn.functional.interpolate(bin_centers, last.shape[-2:], mode="bilinear", align_corners=True) + bin_embedding = nn.functional.interpolate(bin_embedding, last.shape[-2:], mode="bilinear", align_corners=True) + + conditional_log_binomial = self.conditional_log_binomial[bin_configurations_name] + x = conditional_log_binomial(last, bin_embedding) + + # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor + out = torch.sum(x * bin_centers, dim=1, keepdim=True) + + return out, domain_logits + + +class ZoeDepthMetricDepthEstimationHead(nn.Module): + def __init__(self, config): + super().__init__() + + bin_configuration = config.bin_configurations[0] + n_bins = bin_configuration["n_bins"] + min_depth = bin_configuration["min_depth"] + max_depth = bin_configuration["max_depth"] + bin_embedding_dim = config.bin_embedding_dim + n_attractors = config.num_attractors + bin_centers_type = config.bin_centers_type + + self.min_depth = min_depth + self.max_depth = max_depth + self.bin_centers_type = bin_centers_type + + # Bottleneck convolution + bottleneck_features = config.bottleneck_features + self.conv2 = nn.Conv2d(bottleneck_features, bottleneck_features, kernel_size=1, stride=1, padding=0) + + # Regressor and attractor + if self.bin_centers_type == "normed": + Attractor = ZoeDepthAttractorLayer + elif self.bin_centers_type == "softplus": + Attractor = ZoeDepthAttractorLayerUnnormed + + self.seed_bin_regressor = ZoeDepthSeedBinRegressor( + config, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth + ) + self.seed_projector = ZoeDepthProjector(in_features=bottleneck_features, out_features=bin_embedding_dim) + + self.projectors = nn.ModuleList( + [ + ZoeDepthProjector(in_features=config.fusion_hidden_size, out_features=bin_embedding_dim) + for _ in range(4) + ] + ) + self.attractors = nn.ModuleList( + [ + Attractor( + config, + n_bins=n_bins, + n_attractors=n_attractors[i], + min_depth=min_depth, + max_depth=max_depth, + ) + for i in range(4) + ] + ) + + last_in = config.num_relative_features + 1 # +1 for relative depth + + # use log binomial instead of softmax + self.conditional_log_binomial = ZoeDepthConditionalLogBinomialSoftmax( + config, + last_in, + bin_embedding_dim, + n_classes=n_bins, + ) + + def forward(self, outconv_activation, bottleneck, feature_blocks, relative_depth): + x = self.conv2(bottleneck) + _, seed_bin_centers = self.seed_bin_regressor(x) + + if self.bin_centers_type in ["normed", "hybrid2"]: + prev_bin = (seed_bin_centers - self.min_depth) / (self.max_depth - self.min_depth) + else: + prev_bin = seed_bin_centers + + prev_bin_embedding = self.seed_projector(x) + + # unroll this loop for better performance + for projector, attractor, feature in zip(self.projectors, self.attractors, feature_blocks): + bin_embedding = projector(feature) + bin, bin_centers = attractor(bin_embedding, prev_bin, prev_bin_embedding, interpolate=True) + prev_bin = bin.clone() + prev_bin_embedding = bin_embedding.clone() + + last = outconv_activation + + # concatenative relative depth with last. 
First interpolate relative depth to last size + relative_conditioning = relative_depth.unsqueeze(1) + relative_conditioning = nn.functional.interpolate( + relative_conditioning, size=last.shape[2:], mode="bilinear", align_corners=True + ) + last = torch.cat([last, relative_conditioning], dim=1) + + bin_embedding = nn.functional.interpolate(bin_embedding, last.shape[-2:], mode="bilinear", align_corners=True) + x = self.conditional_log_binomial(last, bin_embedding) + + # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor + bin_centers = nn.functional.interpolate(bin_centers, x.shape[-2:], mode="bilinear", align_corners=True) + out = torch.sum(x * bin_centers, dim=1, keepdim=True) + + return out, None + + +# Copied from transformers.models.dpt.modeling_dpt.DPTPreTrainedModel with DPT->ZoeDepth,dpt->zoedepth +class ZoeDepthPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ZoeDepthConfig + base_model_prefix = "zoedepth" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +ZOEDEPTH_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ViTConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ZOEDEPTH_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + """ + ZoeDepth model with one or multiple metric depth estimation head(s) on top. 
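+    The model uses a DPT-style neck and relative depth estimation head on top of the backbone, followed by one or
+    more metric depth estimation heads based on bin regression, attractor layers and a conditional log-binomial softmax.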
+ """, + ZOEDEPTH_START_DOCSTRING, +) +class ZoeDepthForDepthEstimation(ZoeDepthPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.backbone = load_backbone(config) + + if hasattr(self.backbone.config, "hidden_size") and hasattr(self.backbone.config, "patch_size"): + config.backbone_hidden_size = self.backbone.config.hidden_size + self.patch_size = self.backbone.config.patch_size + else: + raise ValueError( + "ZoeDepth assumes the backbone's config to have `hidden_size` and `patch_size` attributes" + ) + + self.neck = ZoeDepthNeck(config) + self.relative_head = ZoeDepthRelativeDepthEstimationHead(config) + + self.metric_head = ( + ZoeDepthMultipleMetricDepthEstimationHeads(config) + if len(config.bin_configurations) > 1 + else ZoeDepthMetricDepthEstimationHead(config) + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ZOEDEPTH_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], DepthEstimatorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Returns: + + Examples: + ```python + >>> from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti") + >>> model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... predicted_depth = outputs.predicted_depth + + >>> # interpolate to original size + >>> prediction = torch.nn.functional.interpolate( + ... predicted_depth.unsqueeze(1), + ... size=image.size[::-1], + ... mode="bicubic", + ... align_corners=False, + ... 
) + + >>> # visualize the prediction + >>> output = prediction.squeeze().cpu().numpy() + >>> formatted = (output * 255 / np.max(output)).astype("uint8") + >>> depth = Image.fromarray(formatted) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = self.backbone.forward_with_filtered_kwargs( + pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions + ) + hidden_states = outputs.feature_maps + + _, _, height, width = pixel_values.shape + patch_size = self.patch_size + patch_height = height // patch_size + patch_width = width // patch_size + + hidden_states, features = self.neck(hidden_states, patch_height, patch_width) + + out = [features] + hidden_states + + relative_depth, features = self.relative_head(hidden_states) + + out = [features] + out + + metric_depth, domain_logits = self.metric_head( + outconv_activation=out[0], bottleneck=out[1], feature_blocks=out[2:], relative_depth=relative_depth + ) + metric_depth = metric_depth.squeeze(dim=1) + + if not return_dict: + if domain_logits is not None: + output = (metric_depth, domain_logits) + outputs[1:] + else: + output = (metric_depth,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return ZoeDepthDepthEstimatorOutput( + loss=loss, + predicted_depth=metric_depth, + domain_logits=domain_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py index a462e3d8240099..0ca5d36d0f40e3 100644 --- a/src/transformers/optimization.py +++ b/src/transformers/optimization.py @@ -519,7 +519,7 @@ def scheduler_hook(param): if param.requires_grad: param.register_post_accumulate_grad_hook(scheduler_hook) - return LayerWiseDummyScheduler() + return LayerWiseDummyScheduler(optimizer_dict=optimizer_dict, lr=optimizer.defaults["lr"]) if name == SchedulerType.CONSTANT: return schedule_func(optimizer) diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py index c6431a499717a4..79a85008e7cf99 100644 --- a/src/transformers/pipelines/depth_estimation.py +++ b/src/transformers/pipelines/depth_estimation.py @@ -91,6 +91,8 @@ def preprocess(self, image, timeout=None): image = load_image(image, timeout) self.image_size = image.size model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py index 64714390b04f1d..c840c14a7191fa 100644 --- a/src/transformers/pipelines/document_question_answering.py +++ b/src/transformers/pipelines/document_question_answering.py @@ -294,7 +294,10 @@ def preprocess( if input.get("image", None) is not None: image = load_image(input["image"], timeout=timeout) if self.image_processor is not None: - image_features.update(self.image_processor(images=image, return_tensors=self.framework)) + image_inputs = self.image_processor(images=image, 
return_tensors=self.framework) + if self.framework == "pt": + image_inputs = image_inputs.to(self.torch_dtype) + image_features.update(image_inputs) elif self.feature_extractor is not None: image_features.update(self.feature_extractor(images=image, return_tensors=self.framework)) elif self.model_type == ModelType.VisionEncoderDecoder: diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index bfa005f06babad..c54f372baa9d05 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -161,6 +161,8 @@ def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Imag def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/image_feature_extraction.py b/src/transformers/pipelines/image_feature_extraction.py index 3a361deabd797d..391eb2b3aec714 100644 --- a/src/transformers/pipelines/image_feature_extraction.py +++ b/src/transformers/pipelines/image_feature_extraction.py @@ -60,6 +60,8 @@ def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None, def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]: image = load_image(image, timeout=timeout) model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index 23fbd4fb79b190..e0fd3b7d85ab19 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -147,6 +147,8 @@ def preprocess(self, image, subtask=None, timeout=None): else: kwargs = {"task_inputs": [subtask]} inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs) + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) inputs["task_inputs"] = self.tokenizer( inputs["task_inputs"], padding="max_length", @@ -155,6 +157,8 @@ def preprocess(self, image, subtask=None, timeout=None): )["input_ids"] else: inputs = self.image_processor(images=[image], return_tensors="pt") + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) inputs["target_size"] = target_size return inputs diff --git a/src/transformers/pipelines/image_to_image.py b/src/transformers/pipelines/image_to_image.py index 8c34ee8dd3c80c..cb66359a4dddea 100644 --- a/src/transformers/pipelines/image_to_image.py +++ b/src/transformers/pipelines/image_to_image.py @@ -119,6 +119,8 @@ def _forward(self, model_inputs): def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) inputs = self.image_processor(images=[image], return_tensors="pt") + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) return inputs def postprocess(self, model_outputs): diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py index 4a9a3744d841a0..88dce8e591ae41 100644 --- a/src/transformers/pipelines/image_to_text.py +++ b/src/transformers/pipelines/image_to_text.py @@ -138,6 +138,8 @@ def preprocess(self, image, 
prompt=None, timeout=None): if model_type == "git": model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids input_ids = [self.tokenizer.cls_token_id] + input_ids input_ids = torch.tensor(input_ids).unsqueeze(0) @@ -145,10 +147,14 @@ def preprocess(self, image, prompt=None, timeout=None): elif model_type == "pix2struct": model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) elif model_type != "vision-encoder-decoder": # vision-encoder-decoder does not support conditional generation model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) text_inputs = self.tokenizer(prompt, return_tensors=self.framework) model_inputs.update(text_inputs) @@ -157,6 +163,8 @@ def preprocess(self, image, prompt=None, timeout=None): else: model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) if self.model.config.model_type == "git" and prompt is None: model_inputs["input_ids"] = None diff --git a/src/transformers/pipelines/mask_generation.py b/src/transformers/pipelines/mask_generation.py index 68d407aff2d4e4..f87e45b7f8ecb4 100644 --- a/src/transformers/pipelines/mask_generation.py +++ b/src/transformers/pipelines/mask_generation.py @@ -181,6 +181,8 @@ def preprocess( image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor ) model_inputs = self.image_processor(images=cropped_images, return_tensors="pt") + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) with self.device_placement(): if self.framework == "pt": diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index 36946cbf8a4511..d3e2135790ff8c 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -107,6 +107,8 @@ def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) target_size = torch.IntTensor([[image.height, image.width]]) inputs = self.image_processor(images=[image], return_tensors="pt") + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) if self.tokenizer is not None: inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt") inputs["target_size"] = target_size diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py index c2dce89dd701be..80f59bf425a2dc 100644 --- a/src/transformers/pipelines/text_generation.py +++ b/src/transformers/pipelines/text_generation.py @@ -266,31 +266,33 @@ def preprocess( prompt_text, prefix="", handle_long_generation=None, - add_special_tokens=False, + add_special_tokens=None, truncation=None, - padding=False, + padding=None, max_length=None, **generate_kwargs, ): + # Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults + tokenizer_kwargs = { + "add_special_tokens": add_special_tokens, + "truncation": truncation, + "padding": padding, + "max_length": max_length, + } + tokenizer_kwargs = {key: value for key, value in tokenizer_kwargs.items() if value is not None} + if 
isinstance(prompt_text, Chat): + tokenizer_kwargs.pop("add_special_tokens", None) # ignore add_special_tokens on chats inputs = self.tokenizer.apply_chat_template( prompt_text.messages, - truncation=truncation, - padding=padding, - max_length=max_length, add_generation_prompt=True, return_dict=True, return_tensors=self.framework, + **tokenizer_kwargs, ) else: - inputs = self.tokenizer( - prefix + prompt_text, - truncation=truncation, - padding=padding, - max_length=max_length, - add_special_tokens=add_special_tokens, - return_tensors=self.framework, - ) + inputs = self.tokenizer(prefix + prompt_text, return_tensors=self.framework, **tokenizer_kwargs) + inputs["prompt_text"] = prompt_text if handle_long_generation == "hole": diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 5702f23c5f6090..68ea928bce5672 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -106,6 +106,8 @@ def preprocess(self, video, num_frames=None, frame_sampling_rate=1): video = list(video) model_inputs = self.image_processor(video, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py index 9455b0d85928a4..e5849cbdec1955 100644 --- a/src/transformers/pipelines/visual_question_answering.py +++ b/src/transformers/pipelines/visual_question_answering.py @@ -155,6 +155,8 @@ def preprocess(self, inputs, padding=False, truncation=False, timeout=None): truncation=truncation, ) image_features = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + image_features = image_features.to(self.torch_dtype) model_inputs.update(image_features) return model_inputs diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index c3606e3c2b83df..d9109aebd9c529 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -121,6 +121,8 @@ def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is inputs = self.feature_extractor( [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" ) + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) inputs["candidate_labels"] = candidate_labels sequences = [hypothesis_template.format(x) for x in candidate_labels] text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True) diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 8e40d0e6a5cbfa..b0ceba8cbe678d 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -120,6 +120,8 @@ def _sanitize_parameters(self, **kwargs): def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}.", timeout=None): image = load_image(image, timeout=timeout) inputs = self.image_processor(images=[image], return_tensors=self.framework) + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) inputs["candidate_labels"] = candidate_labels sequences = [hypothesis_template.format(x) for x in 
candidate_labels] padding = "max_length" if self.model.config.model_type == "siglip" else True diff --git a/src/transformers/pipelines/zero_shot_object_detection.py b/src/transformers/pipelines/zero_shot_object_detection.py index 5be89332cbd910..9ad575202266ee 100644 --- a/src/transformers/pipelines/zero_shot_object_detection.py +++ b/src/transformers/pipelines/zero_shot_object_detection.py @@ -156,6 +156,8 @@ def preprocess(self, inputs, timeout=None): for i, candidate_label in enumerate(candidate_labels): text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework) image_features = self.image_processor(image, return_tensors=self.framework) + if self.framework == "pt": + image_features = image_features.to(self.torch_dtype) yield { "is_last": i == len(candidate_labels) - 1, "target_size": target_size, diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 796e2caf347958..6d2e7f502e0089 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -800,7 +800,7 @@ def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": # Otherwise it passes the casts down and casts the LongTensor containing the token idxs # into a HalfTensor if isinstance(device, str) or is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items()} + self.data = {k: v.to(device=device) for k, v in self.data.items() if v is not None} else: logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") return self diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index affc7b725e8a70..4119e547a37616 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -221,6 +221,11 @@ DistributedDataParallelKwargs, DistributedType, GradientAccumulationPlugin, + is_mlu_available, + is_mps_available, + is_npu_available, + is_torch_version, + is_xpu_available, load_fsdp_model, load_fsdp_optimizer, save_fsdp_model, @@ -2422,7 +2427,7 @@ def _inner_training_loop( for checkpoint in checkpoints_sorted: if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") - shutil.rmtree(checkpoint) + shutil.rmtree(checkpoint, ignore_errors=True) self.control = self.callback_handler.on_train_end(args, self.state, self.control) @@ -3307,6 +3312,20 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, loss = self.compute_loss(model, inputs) del inputs + if ( + self.args.torch_empty_cache_steps is not None + and self.state.global_step % self.args.torch_empty_cache_steps == 0 + ): + if is_xpu_available(): + torch.xpu.empty_cache() + elif is_mlu_available(): + torch.mlu.empty_cache() + elif is_npu_available(): + torch.npu.empty_cache() + elif is_torch_version(">=", "2.0") and is_mps_available(): + torch.mps.empty_cache() + else: + torch.cuda.empty_cache() kwargs = {} @@ -3651,6 +3670,8 @@ def evaluate( total_batch_size = self.args.eval_batch_size * self.args.world_size if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + if f"{metric_key_prefix}_model_preparation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_model_preparation_time"] output.metrics.update( speed_metrics( metric_key_prefix, @@ -3720,6 +3741,8 @@ def predict( total_batch_size = 
self.args.eval_batch_size * self.args.world_size if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + if f"{metric_key_prefix}_model_preparation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_model_preparation_time"] output.metrics.update( speed_metrics( metric_key_prefix, @@ -3758,11 +3781,13 @@ def evaluation_loop( model = self._wrap_model(self.model, training=False, dataloader=dataloader) if len(self.accelerator._models) == 0 and model is self.model: + start_time = time.time() model = ( self.accelerator.prepare(model) if self.is_deepspeed_enabled else self.accelerator.prepare_model(model, evaluation_mode=True) ) + self.model_preparation_time = round(time.time() - start_time, 4) if self.is_fsdp_enabled: self.model = model @@ -3839,6 +3864,9 @@ def evaluation_loop( inputs_decode = self.gather_function((inputs_decode)) if not self.args.batch_eval_metrics or description == "Prediction": all_inputs.add(inputs_decode) + if labels is not None: + # Pad labels here, preparing for preprocess_logits_for_metrics in next logits block. + labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) if logits is not None: logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) if self.preprocess_logits_for_metrics is not None: @@ -3847,7 +3875,6 @@ def evaluation_loop( if not self.args.batch_eval_metrics or description == "Prediction": all_preds.add(logits) if labels is not None: - labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) labels = self.gather_function((labels)) if not self.args.batch_eval_metrics or description == "Prediction": all_labels.add(labels) @@ -3933,6 +3960,8 @@ def evaluation_loop( metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() if hasattr(self, "jit_compilation_time"): metrics[f"{metric_key_prefix}_jit_compilation_time"] = self.jit_compilation_time + if hasattr(self, "model_preparation_time"): + metrics[f"{metric_key_prefix}_model_preparation_time"] = self.model_preparation_time # Prefix all keys with metric_key_prefix + '_' for key in list(metrics.keys()): diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index 856ba4f664d3a7..fcffcd3595be38 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -27,6 +27,7 @@ from collections.abc import Mapping from contextlib import contextmanager from dataclasses import dataclass, field +from itertools import chain from logging import StreamHandler from typing import Any, Dict, Iterator, List, Optional, Union @@ -1379,13 +1380,24 @@ class LayerWiseDummyScheduler(LRScheduler): """ def __init__(self, *args, **kwargs): - optimizer = LayerWiseDummyOptimizer() + self.default_lr = kwargs["lr"] + optimizer = LayerWiseDummyOptimizer(**kwargs) last_epoch = -1 verbose = False super().__init__(optimizer, last_epoch, verbose) def get_lr(self): - return [group["lr"] for group in self.optimizer.param_groups] + # default value + lrs = [self.default_lr] + + # we take each lr in the parameters if they exist, assumes the optimizer to be the `LayerWiseDummyOptimizer` + if self.optimizer is not None: + param_wise_lrs = [ + [group["lr"] for group in optim.param_groups] for optim in self.optimizer.optimizer_dict.values() + ] + lrs = list(chain(*param_wise_lrs)) + + return lrs def _get_closed_form_lr(self): return self.base_lrs diff --git a/src/transformers/training_args.py 
b/src/transformers/training_args.py index 5eff032774e203..6d68405ab35a24 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -48,6 +48,7 @@ is_torch_bf16_cpu_available, is_torch_bf16_gpu_available, is_torch_mlu_available, + is_torch_mps_available, is_torch_neuroncore_available, is_torch_npu_available, is_torch_tf32_available, @@ -267,6 +268,15 @@ class TrainingArguments: eval_delay (`float`, *optional*): Number of epochs or steps to wait for before the first evaluation can be performed, depending on the eval_strategy. + torch_empty_cache_steps (`int`, *optional*): + Number of steps to wait before calling `torch..empty_cache()`. If left unset or set to None, cache will not be emptied. + + + + This can help avoid CUDA out-of-memory errors by lowering peak VRAM usage at a cost of about [10% slower performance](https://github.com/huggingface/transformers/issues/31372). + + + learning_rate (`float`, *optional*, defaults to 5e-5): The initial learning rate for [`AdamW`] optimizer. weight_decay (`float`, *optional*, defaults to 0): @@ -427,8 +437,9 @@ class TrainingArguments: use the corresponding output (usually index 2) as the past state and feed it to the model at the next training step under the keyword argument `mems`. run_name (`str`, *optional*, defaults to `output_dir`): - A descriptor for the run. Typically used for [wandb](https://www.wandb.com/) and - [mlflow](https://www.mlflow.org/) logging. If not specified, will be the same as `output_dir`. + A descriptor for the run. Typically used for [wandb](https://www.wandb.com/), + [mlflow](https://www.mlflow.org/) and [comet](https://www.comet.com/site) logging. If not specified, will + be the same as `output_dir`. disable_tqdm (`bool`, *optional*): Whether or not to disable the tqdm progress bars and table of metrics produced by [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is @@ -851,6 +862,15 @@ class TrainingArguments: }, ) + torch_empty_cache_steps: Optional[int] = field( + default=None, + metadata={ + "help": "Number of steps to wait before calling `torch..empty_cache()`." + "This can help avoid CUDA out-of-memory errors by lowering peak VRAM usage at a cost of about [10% slower performance](https://github.com/huggingface/transformers/issues/31372)." + "If left unset or set to None, cache will not be emptied." + }, + ) + learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."}) weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."}) adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) @@ -1131,7 +1151,8 @@ class TrainingArguments: ) run_name: Optional[str] = field( - default=None, metadata={"help": "An optional descriptor for the run. Notably used for wandb logging."} + default=None, + metadata={"help": "An optional descriptor for the run. Notably used for wandb, mlflow and comet logging."}, ) disable_tqdm: Optional[bool] = field( default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."} @@ -1532,6 +1553,12 @@ def __post_init__(self): if self.do_eval is False and self.eval_strategy != IntervalStrategy.NO: self.do_eval = True + if self.torch_empty_cache_steps is not None: + if not (isinstance(self.torch_empty_cache_steps, int) or self.torch_empty_cache_steps > 0): + raise ValueError( + f"`torch_empty_cache_steps` must be an integer bigger than 0, got {self.torch_empty_cache_steps}." 
+ ) + # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero if self.eval_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): if self.logging_steps > 0: @@ -2152,6 +2179,8 @@ def _setup_devices(self) -> "torch.device": ) if self.use_cpu: device = torch.device("cpu") + elif is_torch_mps_available(): + device = torch.device("mps") elif is_torch_xpu_available(): if not is_ipex_available() and not is_accelerate_available("0.32.0.dev"): raise ImportError("Using the XPU PyTorch backend requires `accelerate>=0.32.0.dev`") @@ -2170,7 +2199,9 @@ def _setup_devices(self) -> "torch.device": # trigger an error that a device index is missing. Index 0 takes into account the # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` # will use the first GPU in that env, i.e. GPU#1 - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + device = torch.device( + "cuda:0" if torch.cuda.is_available() else os.environ.get("ACCELERATE_TORCH_DEVICE", "cpu") + ) # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at # the default value. self._n_gpu = torch.cuda.device_count() diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 12a6c5afe926bf..9df53c3f1d6161 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -160,7 +160,7 @@ class TFTrainingArguments(TrainingArguments): Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to automatically detect from metadata. run_name (`str`, *optional*): - A descriptor for the run. Notably used for wandb logging. + A descriptor for the run. Notably used for wandb, mlflow and comet logging. xla (`bool`, *optional*): Whether to activate the XLA compilation or not. 
""" diff --git a/src/transformers/utils/chat_template_utils.py b/src/transformers/utils/chat_template_utils.py index ee6173f2a1532b..414d2fb72454c5 100644 --- a/src/transformers/utils/chat_template_utils.py +++ b/src/transformers/utils/chat_template_utils.py @@ -80,7 +80,7 @@ def _parse_type_hint(hint: str) -> Dict: return_dict = subtypes[0] elif all(isinstance(subtype["type"], str) for subtype in subtypes): # A union of basic types can be expressed as a list in the schema - return_dict = {"type": [subtype["type"] for subtype in subtypes]} + return_dict = {"type": sorted([subtype["type"] for subtype in subtypes])} else: # A union of more complex types requires "anyOf" return_dict = {"anyOf": subtypes} diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c9267debc5de81..edc4c95b1a35ed 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -37,6 +37,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class EncoderDecoderCache(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class HQQQuantizedCache(metaclass=DummyObject): _backends = ["torch"] @@ -9653,6 +9660,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class ZoeDepthForDepthEstimation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ZoeDepthPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Adafactor(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 359c5481757d67..9d5175ed2aeab9 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -651,3 +651,10 @@ class YolosImageProcessor(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) + + +class ZoeDepthImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index c3687c035c5837..0aa296e7055f0b 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -995,6 +995,13 @@ def _generate_dummy_input( inputs_dict[input_name] = torch.zeros( *shape, model.config.input_feat_per_channel, dtype=torch.float, device=device ) + elif "inputs_embeds" in input_name: + batch_size = shape[0] + sequence_length = shape[-1] + + inputs_dict[input_name] = torch.zeros( + batch_size, sequence_length, model.config.hidden_size, dtype=torch.float, device=device + ) elif "visual_feats" in input_name: inputs_dict[input_name] = torch.zeros( shape diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 80232898ce4707..01c5ede34ae83e 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -762,7 +762,7 @@ def torch_int(x): import torch - return x.to(torch.int64) if torch.jit.is_tracing() else int(x) + return x.to(torch.int64) if torch.jit.is_tracing() and isinstance(x, torch.Tensor) else int(x) def torch_float(x): @@ -774,7 +774,7 @@ def torch_float(x): import torch - return x.to(torch.float32) if torch.jit.is_tracing() else int(x) + return 
x.to(torch.float32) if torch.jit.is_tracing() and isinstance(x, torch.Tensor) else int(x) def filter_out_non_signature_kwargs(extra: Optional[list] = None): diff --git a/tests/agents/test_agents.py b/tests/agents/test_agents.py index 062b98abd47350..5bdaea1651b741 100644 --- a/tests/agents/test_agents.py +++ b/tests/agents/test_agents.py @@ -94,12 +94,48 @@ def fake_react_code_llm_error(messages, stop_sequences=None) -> str: """ +def fake_react_code_functiondef(messages, stop_sequences=None) -> str: + prompt = str(messages) + if "special_marker" not in prompt: + return """ +Thought: Let's define the function. special_marker +Code: +```py +import numpy as np + +def moving_average(x, w): + return np.convolve(x, np.ones(w), 'valid') / w +``` +""" + else: # We're at step 2 + return """ +Thought: I can now answer the initial question +Code: +```py +x, w = [0, 1, 2, 3, 4, 5], 2 +res = moving_average(x, w) +final_answer(res) +``` +""" + + def fake_code_llm_oneshot(messages, stop_sequences=None) -> str: return """ Thought: I should multiply 2 by 3.6452. special_marker Code: ```py result = python_interpreter(code="2*3.6452") +final_answer(result) +``` +""" + + +def fake_code_llm_no_return(messages, stop_sequences=None) -> str: + return """ +Thought: I should multiply 2 by 3.6452. special_marker +Code: +```py +result = python_interpreter(code="2*3.6452") print(result) ``` """ @@ -135,8 +171,8 @@ def test_fake_react_json_agent(self): def test_fake_react_code_agent(self): agent = ReactCodeAgent(tools=[PythonInterpreterTool()], llm_engine=fake_react_code_llm) output = agent.run("What is 2 multiplied by 3.6452?") - assert isinstance(output, AgentText) - assert output == "7.2904" + assert isinstance(output, float) + assert output == 7.2904 assert agent.logs[0]["task"] == "What is 2 multiplied by 3.6452?" 
assert float(agent.logs[1]["observation"].strip()) - 12.511648 < 1e-6 assert agent.logs[2]["tool_call"] == { @@ -157,7 +193,7 @@ def test_setup_agent_with_empty_toolbox(self): def test_react_fails_max_iterations(self): agent = ReactCodeAgent( tools=[PythonInterpreterTool()], - llm_engine=fake_code_llm_oneshot, # use this callable because it never ends + llm_engine=fake_code_llm_no_return, # use this callable because it never ends max_iterations=5, ) agent.run("What is 2 multiplied by 3.6452?") @@ -192,3 +228,10 @@ def test_init_agent_with_different_toolsets(self): # check that python_interpreter base tool does not get added to code agents agent = ReactCodeAgent(tools=[], llm_engine=fake_react_code_llm, add_base_tools=True) assert len(agent.toolbox.tools) == 6 # added final_answer tool + 5 base tools (excluding interpreter) + + def test_function_persistence_across_steps(self): + agent = ReactCodeAgent( + tools=[], llm_engine=fake_react_code_functiondef, max_iterations=2, additional_authorized_imports=["numpy"] + ) + res = agent.run("ok") + assert res[0] == 0.5 diff --git a/tests/agents/test_python_interpreter.py b/tests/agents/test_python_interpreter.py index 6f5907e27be1f0..8843a394b35313 100644 --- a/tests/agents/test_python_interpreter.py +++ b/tests/agents/test_python_interpreter.py @@ -660,7 +660,6 @@ def add_one(n, shift): """ state = {} result = evaluate_python_code(code, {"print": print, "range": range, "ord": ord, "chr": chr}, state=state) - print(state) assert result == 2 # test returning None @@ -672,5 +671,4 @@ def returns_none(a): """ state = {} result = evaluate_python_code(code, {"print": print, "range": range, "ord": ord, "chr": chr}, state=state) - print(state) assert result is None diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 3293cc279d019a..cab6fe8d094cd6 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -30,7 +30,9 @@ require_auto_gptq, require_quanto, require_torch, + require_torch_gpu, require_torch_multi_accelerator, + require_torch_multi_gpu, slow, torch_device, ) @@ -57,7 +59,7 @@ ImageGPTForCausalImageModeling, SpeechEncoderDecoderModel, ) - from transformers.cache_utils import DynamicCache, QuantoQuantizedCache + from transformers.cache_utils import DynamicCache, EncoderDecoderCache, QuantoQuantizedCache from transformers.generation import ( BeamSampleDecoderOnlyOutput, BeamSampleEncoderDecoderOutput, @@ -1262,6 +1264,55 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): for output in (output_greedy, output_prompt_lookup): self._check_outputs(output, input_ids, model.config, use_cache=True) + def test_dola_decoding_sample(self): + # TODO (joao): investigate skips, try to reduce incompatibilities + for model_class in self.all_generative_model_classes: + if model_class._is_stateful: + self.skipTest(reason="Stateful models don't support DoLa decoding") + + if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]): + self.skipTest("Skip Reformer as the lm_head input size is 2 * hidden size, adopted from Rev Nets.") + + if any(model_name in model_class.__name__.lower() for model_name in ["marian", "mbart", "pegasus"]): + self.skipTest("DoLa is not supported for models that don't return layerwise hidden states") + + # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm + config, input_ids, attention_mask = self._get_input_ids_and_config() + + # Some models don't support the cache and returning past_key_values + if not hasattr(config, "use_cache"): + 
config.use_cache = False + else: + config.use_cache = True + + # Encoder-decoder models are not supported + if config.is_encoder_decoder: + self.skipTest("DoLa is not supported for encoder-decoder models") + config.is_decoder = True + model = model_class(config).to(torch_device).eval() + + if model.get_output_embeddings() is None: + self.skipTest("DoLa is not supported for models that don't have output embeddings") + # Sets dola generation arguments such that: + # a) no EOS is generated, to ensure generation doesn't break early + # b) there are at least two forward passes in the main model, to ensure the input preparation of + # the main model is correct + generation_kwargs = { + "eos_token_id": -1, # see a) + "max_new_tokens": 4, # see b) + "num_beams": 1, + "do_sample": True, + "output_scores": True, + "output_logits": True, + "output_hidden_states": True, + "output_attentions": self.has_attentions, + "return_dict_in_generate": True, + } + generation_kwargs.update({"dola_layers": "low"}) + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} + output_dola = model.generate(input_ids, **model_kwargs, **generation_kwargs) + self._check_outputs(output_dola, input_ids, model.config, use_cache=config.use_cache) + def test_assisted_decoding_sample(self): # In this test we don't check assisted vs non-assisted output -- seeded assisted decoding with sample will not # match sample for the same seed, as the forward pass does not return the exact same logits (due to matmul with @@ -1636,7 +1687,6 @@ def test_new_cache_format(self, num_beams, do_sample): config, input_ids, attention_mask = self._get_input_ids_and_config() config.use_cache = True - config.is_decoder = True model = model_class(config).to(torch_device).eval() generation_kwargs = { @@ -1652,15 +1702,21 @@ def test_new_cache_format(self, num_beams, do_sample): set_seed(seed) legacy_results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs) set_seed(seed) + if config.is_encoder_decoder: + cache_cls = EncoderDecoderCache + past_key_values = cache_cls(DynamicCache(), DynamicCache()) + else: + cache_cls = DynamicCache + past_key_values = cache_cls() new_results = model.generate( - input_ids, attention_mask=attention_mask, past_key_values=DynamicCache(), **generation_kwargs + input_ids, attention_mask=attention_mask, past_key_values=past_key_values, **generation_kwargs ) # The two sets of generated sequences must match, despite the cache format between forward passes being # different self.assertListEqual(legacy_results.sequences.tolist(), new_results.sequences.tolist()) self.assertTrue(isinstance(legacy_results.past_key_values, tuple)) - self.assertTrue(isinstance(new_results.past_key_values, DynamicCache)) + self.assertTrue(isinstance(new_results.past_key_values, cache_cls)) # The contents of the two caches, when converted to the same format (in both directions!), must match legacy_cache = legacy_results.past_key_values @@ -1675,7 +1731,7 @@ def test_new_cache_format(self, num_beams, do_sample): ) new_cache = new_results.past_key_values - legacy_cache_converted = DynamicCache.from_legacy_cache(legacy_results.past_key_values) + legacy_cache_converted = cache_cls.from_legacy_cache(legacy_results.past_key_values) for layer_idx in range(len(new_cache)): for kv_idx in range(len(new_cache[layer_idx])): self.assertTrue( @@ -2082,6 +2138,7 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa [1, 18], ) + # TODO (joao): replace `stop_sequence` in the 
pipeline by the more recent `generate` functionality def test_stop_sequence_stopping_criteria(self): # PT-only test: TF doesn't have StoppingCriteria prompt = """Hello I believe in""" @@ -2089,17 +2146,11 @@ def test_stop_sequence_stopping_criteria(self): output = generator(prompt) self.assertEqual( output, - [ - { - "generated_text": ( - "Hello I believe in in in number number number number number number number number number" - ) - } - ], + [{"generated_text": ("Hello I believe in we we we we we we we we we")}], ) - output = generator(prompt, stop_sequence=" number") - self.assertEqual(output, [{"generated_text": "Hello I believe in in in number"}]) + output = generator(prompt, stop_sequence=" we") + self.assertEqual(output, [{"generated_text": "Hello I believe in we"}]) def test_generate_non_nlp_input_ids_as_kwarg(self): # PT-only test: AFAIK there's no non-NLP model architecture in TF that supports `input_ids` as its only input @@ -3097,6 +3148,54 @@ def test_return_unprocessed_logit_scores(self): self.assertTrue(y_prob > 0.001 and n_prob > 0.001) self.assertTrue(y_prob <= 1.0 and n_prob <= 1.0) + @slow + @require_torch_multi_gpu + def test_assisted_decoding_in_different_gpu(self): + # PT-only test: TF doesn't support assisted decoding yet. + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to("cuda:0") + assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to( + "cuda:1" + ) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM") + model.config.pad_token_id = tokenizer.eos_token_id + assistant.config.pad_token_id = tokenizer.eos_token_id + + text = "Hello world" + tokenized_inputs = tokenizer([text], return_tensors="pt") + input_ids = tokenized_inputs.input_ids.to(torch_device) + input_length = input_ids.shape[-1] + + out = model.generate( + input_ids, + assistant_model=assistant, + max_new_tokens=20, + ) + self.assertTrue(input_length <= out.shape[-1] <= input_length + 20) + + @slow + @require_torch_gpu + def test_assisted_decoding_in_gpu_cpu(self): + # PT-only test: TF doesn't support assisted decoding yet. 
+ model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to("cuda") + assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to( + "cpu" + ) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM") + model.config.pad_token_id = tokenizer.eos_token_id + assistant.config.pad_token_id = tokenizer.eos_token_id + + text = "Hello world" + tokenized_inputs = tokenizer([text], return_tensors="pt") + input_ids = tokenized_inputs.input_ids.to(torch_device) + input_length = input_ids.shape[-1] + + out = model.generate( + input_ids, + assistant_model=assistant, + max_new_tokens=20, + ) + self.assertTrue(input_length <= out.shape[-1] <= input_length + 20) + @require_torch class TokenHealingTestCase(unittest.TestCase): diff --git a/tests/models/efficientnet/test_modeling_efficientnet.py b/tests/models/efficientnet/test_modeling_efficientnet.py index 0c699f3fa61fd3..023325ce424155 100644 --- a/tests/models/efficientnet/test_modeling_efficientnet.py +++ b/tests/models/efficientnet/test_modeling_efficientnet.py @@ -215,6 +215,12 @@ def test_model_from_pretrained(self): def test_pipeline_image_feature_extraction(self): super().test_pipeline_image_feature_extraction() + @is_pipeline_test + @require_vision + @slow + def test_pipeline_image_feature_extraction_fp16(self): + super().test_pipeline_image_feature_extraction_fp16() + @is_pipeline_test @require_vision @slow diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index c7fb55f682ed0e..373885f535bd73 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -542,7 +542,7 @@ def setUpClass(cls): @require_read_token def test_model_2b_fp16(self): - model_id = "google/gemma-2-9b" + model_id = "google/gemma-2b" EXPECTED_TEXTS = [ "Hello I am doing a project on the 1990s and I need to know what the most popular music", "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", @@ -607,8 +607,8 @@ def test_model_2b_eager(self): # considering differences in hardware processing and potential deviations in generated text. EXPECTED_TEXTS = { 7: [ - "Hello I am doing a project on the 1990s and I am looking for some information on the ", - "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", ], 8: [ "Hello I am doing a project on the 1990s and I need to know what the most popular music", @@ -733,6 +733,9 @@ def test_model_7b_fp32(self): @require_read_token def test_model_7b_fp16(self): + if self.cuda_compute_capability_major_version == 7: + self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.") + model_id = "google/gemma-7b" EXPECTED_TEXTS = [ """Hello I am doing a project on a 1999 4.0L 4x4. I""", @@ -753,6 +756,9 @@ def test_model_7b_fp16(self): @require_read_token def test_model_7b_bf16(self): + if self.cuda_compute_capability_major_version == 7: + self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.") + model_id = "google/gemma-7b" # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. 
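The test hunks above (`test_dola_decoding_sample` in the generation tests) and below (the new Gemma, Llama and Mistral DoLa tests) all exercise the same newly added `generate` arguments. A minimal sketch of that usage, mirroring the calls in the new tests and assuming the `google/gemma-2b` checkpoint they use (any causal LM checkpoint could be substituted):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint and generation settings mirror the new DoLa tests
# (dola_layers="low", repetition_penalty=1.2); this is an illustrative
# sketch, not part of the patch itself.
model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Hello I am doing", return_tensors="pt")

# DoLa decoding contrasts the final layer's logits with those of earlier
# ("low") layers before sampling/greedy selection.
output = model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=False,
    dola_layers="low",
    repetition_penalty=1.2,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```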
@@ -788,6 +794,9 @@ def test_model_7b_bf16(self): @require_read_token def test_model_7b_fp16_static_cache(self): + if self.cuda_compute_capability_major_version == 7: + self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.") + model_id = "google/gemma-7b" EXPECTED_TEXTS = [ """Hello I am doing a project on a 1999 4.0L 4x4. I""", @@ -815,7 +824,7 @@ def test_model_7b_4bit(self): EXPECTED_TEXTS = { 7: [ "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", - """Hi today I am going to talk about the new update for the game called "The new update" and I""", + "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very", ], 8: [ "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", @@ -830,7 +839,6 @@ def test_model_7b_4bit(self): output = model.generate(**inputs, max_new_tokens=20, do_sample=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) @slow @@ -889,3 +897,24 @@ def test_compile_static_cache(self): ) static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_compiled_text) + + def test_model_2b_bf16_dola(self): + model_id = "google/gemma-2b" + # ground truth text generated with dola_layers="low", repetition_penalty=1.2 + EXPECTED_TEXTS = [ + "Hello I am doing an experiment and need to get the mass of a block. The problem is, it has no scale", + "Hi today we have the review for a 2016/2017 season of", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate( + **inputs, max_new_tokens=20, do_sample=False, dola_layers="low", repetition_penalty=1.2 + ) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py index 870265f9460f7b..20b8ea3ec5c825 100644 --- a/tests/models/gemma2/test_modeling_gemma2.py +++ b/tests/models/gemma2/test_modeling_gemma2.py @@ -16,7 +16,7 @@ import unittest -from transformers import AutoModelForCausalLM, AutoTokenizer, Gemma2Config, is_torch_available +from transformers import AutoModelForCausalLM, AutoTokenizer, Gemma2Config, is_torch_available, pipeline from transformers.testing_utils import ( require_read_token, require_torch, @@ -102,41 +102,62 @@ def setUpClass(cls): cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] @require_read_token - def test_model_2b_bf16(self): + def test_model_9b_bf16(self): model_id = "google/gemma-2-9b" EXPECTED_TEXTS = [ - "Hello I am doing a project for a class and I am trying to use the ", - "Hi today. So, I'm going to show you how to do a problem from the textbook. So", + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + "Hi today I'm going to be talking about the history of the United States. 
The United States of America", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( - torch_device - ) + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" + ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + output_text = tokenizer.batch_decode(output, skip_special_tokens=False) self.assertEqual(output_text, EXPECTED_TEXTS) @require_read_token - def test_model_2b_fp16(self): + def test_model_9b_fp16(self): model_id = "google/gemma-2-9b" EXPECTED_TEXTS = [ - "Hello I am doing a project on the effect of the temperature on the rate of a reaction. I am using a ", - "Hi today I'm going to be talking about the 1000-4000-", + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + "Hi today I'm going to be talking about the history of the United States. The United States of America", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( - torch_device - ) + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager" + ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + output_text = tokenizer.batch_decode(output, skip_special_tokens=False) self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_9b_pipeline_bf16(self): + # See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Gemma2 before this PR + model_id = "google/gemma-2-9b" + # EXPECTED_TEXTS should match the same non-pipeline test, minus the special tokens + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + "Hi today I'm going to be talking about the history of the United States. 
The United States of America", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( + torch_device + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + + output = pipe(self.input_text, max_new_tokens=20, do_sample=False, padding=True) + + self.assertEqual(output[0][0]["generated_text"], EXPECTED_TEXTS[0]) + self.assertEqual(output[1][0]["generated_text"], EXPECTED_TEXTS[1]) diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 0935e802c685b9..3e1d5c26eb1587 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -703,6 +703,29 @@ def test_model_7b_logits(self): ) ) + @slow + def test_model_7b_dola_generation(self): + # ground truth text generated with dola_layers="low", repetition_penalty=1.2 + EXPECTED_TEXT_COMPLETION = ( + "Simply put, the theory of relativity states that 1) time and space are relative, and 2) the laws of " + "physics are the same for all observers in uniform motion relative to one another.\n\nThe theory of " + "relativity was developed by Albert Einstein in the early 20th century, and it revolutionized our " + "understanding of space and time." + ) + prompt = "Simply put, the theory of relativity states that " + tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") + model = LlamaForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-chat-hf", device_map="sequential", torch_dtype=torch.float16 + ) + model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + # greedy generation outputs + generated_ids = model.generate( + **model_inputs, max_new_tokens=64, top_p=None, temperature=1, do_sample=False, dola_layers="low" + ) + text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + @slow @require_torch_gpu @require_read_token diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index f2ba612a47b1c7..34246b8e967a82 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -555,6 +555,30 @@ def test_model_7b_generation(self): text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text) + @slow + def test_model_7b_dola_generation(self): + # ground truth text generated with dola_layers="low", repetition_penalty=1.2 + EXPECTED_TEXT_COMPLETION = ( + """My favourite condiment is 100% ketchup. 
I love it on everything, and I’m not ash""" + ) + prompt = "My favourite condiment is " + tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False) + model = MistralForCausalLM.from_pretrained( + "mistralai/Mistral-7B-v0.1", device_map="auto", torch_dtype=torch.float16 + ) + input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device) + + # greedy generation outputs + generated_ids = model.generate( + input_ids, max_new_tokens=20, temperature=0, dola_layers="low", repetition_penalty=1.2 + ) + text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + + del model + backend_empty_cache(torch_device) + gc.collect() + @require_bitsandbytes @slow @require_flash_attn diff --git a/tests/models/rt_detr/test_modeling_rt_detr.py b/tests/models/rt_detr/test_modeling_rt_detr.py index 2d3d48dba33125..65a417fe56f618 100644 --- a/tests/models/rt_detr/test_modeling_rt_detr.py +++ b/tests/models/rt_detr/test_modeling_rt_detr.py @@ -16,6 +16,7 @@ import inspect import math +import tempfile import unittest from parameterized import parameterized @@ -583,6 +584,11 @@ def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() configs_no_init = _config_zero_init(config) + configs_no_init.initializer_bias_prior_prob = 0.2 + bias_value = -1.3863 # log_e ((1 - 0.2) / 0.2) + + failed_cases = [] + for model_class in self.all_model_classes: model = model_class(config=configs_no_init) # Skip the check for the backbone @@ -593,20 +599,36 @@ def test_initialization(self): for name, param in model.named_parameters(): if param.requires_grad: - if ( + if ("class_embed" in name and "bias" in name) or "enc_score_head.bias" in name: + bias_tensor = torch.full_like(param.data, bias_value) + if not torch.allclose(param.data, bias_tensor, atol=1e-4): + failed_cases.append( + f"Parameter {name} of model {model_class} seems not properly initialized. " + f"Biases should be initialized to {bias_value}, got {param.data}" + ) + elif ( "level_embed" in name or "sampling_offsets.bias" in name or "value_proj" in name or "output_proj" in name or "reference_points" in name + or "enc_score_head.weight" in name + or ("class_embed" in name and "weight" in name) or name in backbone_params ): continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) + else: + mean = param.data.mean() + round_mean = (mean * 1e9).round() / 1e9 + round_mean = round_mean.item() + if round_mean not in [0.0, 1.0]: + failed_cases.append( + f"Parameter {name} of model {model_class} seems not properly initialized. 
" + f"Mean is {round_mean}, but should be in [0, 1]" + ) + + message = "\n" + "\n".join(failed_cases) + self.assertTrue(not failed_cases, message) @parameterized.expand(["float32", "float16", "bfloat16"]) @require_torch_gpu @@ -630,6 +652,48 @@ def test_inference_with_different_dtypes(self, torch_dtype_str): with torch.no_grad(): _ = model(**self._prepare_for_class(inputs_dict, model_class)) + @parameterized.expand(["float32", "float16", "bfloat16"]) + @require_torch_gpu + @slow + def test_inference_equivalence_for_static_and_dynamic_anchors(self, torch_dtype_str): + torch_dtype = { + "float32": torch.float32, + "float16": torch.float16, + "bfloat16": torch.bfloat16, + }[torch_dtype_str] + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + h, w = inputs_dict["pixel_values"].shape[-2:] + + # convert inputs to the desired dtype + for key, tensor in inputs_dict.items(): + if tensor.dtype == torch.float32: + inputs_dict[key] = tensor.to(torch_dtype) + + for model_class in self.all_model_classes: + with tempfile.TemporaryDirectory() as tmpdirname: + model_class(config).save_pretrained(tmpdirname) + model_static = model_class.from_pretrained( + tmpdirname, anchor_image_size=[h, w], device_map=torch_device, torch_dtype=torch_dtype + ).eval() + model_dynamic = model_class.from_pretrained( + tmpdirname, anchor_image_size=None, device_map=torch_device, torch_dtype=torch_dtype + ).eval() + + self.assertIsNotNone(model_static.config.anchor_image_size) + self.assertIsNone(model_dynamic.config.anchor_image_size) + + with torch.no_grad(): + outputs_static = model_static(**self._prepare_for_class(inputs_dict, model_class)) + outputs_dynamic = model_dynamic(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertTrue( + torch.allclose( + outputs_static.last_hidden_state, outputs_dynamic.last_hidden_state, rtol=1e-4, atol=1e-4 + ), + f"Max diff: {(outputs_static.last_hidden_state - outputs_dynamic.last_hidden_state).abs().max()}", + ) + TOLERANCE = 1e-4 diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index a8e1bb7b0f1264..9d1e3109b313c3 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -18,18 +18,30 @@ import os import tempfile import unittest +from typing import Tuple import numpy as np import requests +from parameterized import parameterized +from pytest import mark from transformers import SiglipConfig, SiglipTextConfig, SiglipVisionConfig from transformers.testing_utils import ( + require_flash_attn, require_torch, + require_torch_gpu, + require_torch_sdpa, require_vision, slow, torch_device, ) -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import ( + is_torch_available, + is_torch_bf16_available_on_device, + is_torch_fp16_available_on_device, + is_torch_sdpa_available, + is_vision_available, +) from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -37,6 +49,7 @@ _config_zero_init, floats_tensor, ids_tensor, + is_flaky, random_attention_mask, ) from ...test_pipeline_mixin import PipelineTesterMixin @@ -48,6 +61,8 @@ from transformers import SiglipForImageClassification, SiglipModel, SiglipTextModel, SiglipVisionModel +if is_torch_sdpa_available(): + from torch.nn.attention import SDPBackend, sdpa_kernel if is_vision_available(): from PIL import Image @@ -55,6 +70,155 @@ from transformers import SiglipProcessor +class SiglipModelTesterMixin(ModelTesterMixin): 
+ def test_eager_matches_sdpa_inference( + self, + torch_dtype: str, + use_attention_mask_options: Tuple[bool, ...] = (True, False), + logit_keys: Tuple[str, ...] = ("logits_per_image", "logits_per_text", "image_embeds", "text_embeds"), + ): + if not self.all_model_classes[0]._supports_sdpa: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): + self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") + + if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): + self.skipTest( + f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" + ) + + # Convert to torch dtype + dtypes = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + torch_dtype = dtypes[torch_dtype] + + atols = { + torch.float32: 1e-5, + torch.bfloat16: 3e-2, + torch.float16: 5e-3, + } + rtols = { + torch.float32: 1e-4, + torch.bfloat16: 3e-2, + torch.float16: 5e-3, + } + + atol = atols[torch_dtype] + rtol = rtols[torch_dtype] + + def get_mean_reldiff(msg, current_case, x, ref, atol, rtol): + return f"{msg} {current_case}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + # Load the model with SDPA + model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) + model_sdpa = model_sdpa.eval().to(torch_device) + + # Load model with eager attention + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch_dtype, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving the model each time, + # but it would be nicer to have an efficient way to use parameterized.expand + cases = [ + (use_mask, output_attentions, sdpa_backend, batch_size) + for use_mask in use_attention_mask_options + for output_attentions in [True, False] + for sdpa_backend in [ + SDPBackend.MATH, + [SDPBackend.FLASH_ATTENTION, SDPBackend.MATH], + [SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH], + [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH], + ] + for batch_size in [1, 5] + ] + fail_cases = [] + + for use_mask, output_attentions, sdpa_backend, batch_size in cases: + processed_inputs = inputs_dict.copy() + + # convert to torch_dtype + if 
"pixel_values" in processed_inputs: + processed_inputs["pixel_values"] = processed_inputs["pixel_values"].to(torch_dtype) + + # slice for different batch sizes + for key in ["pixel_values", "input_ids", "attention_mask"]: + if key in processed_inputs: + processed_inputs[key] = processed_inputs[key][:batch_size] + + # set attention mask with left padding + if not use_mask: + processed_inputs.pop("attention_mask", None) + else: + dummy_attention_mask = processed_inputs["attention_mask"] + dummy_attention_mask[:] = 1 + dummy_attention_mask[:, :1] = 0 + processed_inputs["attention_mask"] = dummy_attention_mask + + processed_inputs["output_attentions"] = output_attentions + processed_inputs["output_hidden_states"] = True + + current_case = ( + f"padding_side=left, use_mask={use_mask}, batch_size={batch_size}, sdpa_backend={sdpa_backend}" + ) + + prepared_inputs = self._prepare_for_class(processed_inputs, model_class) + + with torch.no_grad(): + try: + with sdpa_kernel(sdpa_backend): + outputs_eager = model_eager(**prepared_inputs) + outputs_sdpa = model_sdpa(**prepared_inputs) + except Exception as e: + fail_cases.append(f"{current_case}: {e}") + continue + + for key in logit_keys: + eager_logits = outputs_eager[key] + sdpa_logits = outputs_sdpa[key] + + if use_mask: + eager_logits = eager_logits[:, 1:] + sdpa_logits = sdpa_logits[:, 1:] + + is_close = torch.allclose(eager_logits, sdpa_logits, atol=atol, rtol=rtol) + if not is_close: + fail_cases.append(get_mean_reldiff(key, current_case, sdpa_logits, eager_logits, atol, rtol)) + + self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) + + class SiglipVisionModelTester: def __init__( self, @@ -135,7 +299,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class SiglipVisionModelTest(ModelTesterMixin, unittest.TestCase): +class SiglipVisionModelTest(SiglipModelTesterMixin, unittest.TestCase): """ Here we also overwrite some of the tests of test_modeling_common.py, as SIGLIP does not use input_ids, inputs_embeds, attention_mask and seq_length. 
@@ -225,6 +389,17 @@ def test_model_from_pretrained(self): model = SiglipVisionModel.from_pretrained(model_name) self.assertIsNotNone(model) + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, + logit_keys=("pooler_output", "last_hidden_state"), + use_attention_mask_options=(False,), + ) + class SiglipTextModelTester: def __init__( @@ -314,7 +489,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class SiglipTextModelTest(ModelTesterMixin, unittest.TestCase): +class SiglipTextModelTest(SiglipModelTesterMixin, unittest.TestCase): all_model_classes = (SiglipTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -335,27 +510,19 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip - # Copied from tests.models.clip.test_modeling_clip.CLIPTextModelTest.test_training + @unittest.skip(reason="SiglipTextModel does not support standalone training") def test_training(self): pass - @unittest.skip - # Copied from tests.models.clip.test_modeling_clip.CLIPTextModelTest.test_training_gradient_checkpointing + @unittest.skip(reason="SiglipTextModel does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - # Copied from tests.models.clip.test_modeling_clip.CLIPTextModelTest.test_training_gradient_checkpointing_use_reentrant + @unittest.skip(reason="SiglipTextModel does not support standalone training") def test_training_gradient_checkpointing_use_reentrant(self): pass - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - # Copied from tests.models.clip.test_modeling_clip.CLIPTextModelTest.test_training_gradient_checkpointing_use_reentrant_false + @unittest.skip(reason="SiglipTextModel does not support standalone training") def test_training_gradient_checkpointing_use_reentrant_false(self): pass @@ -384,6 +551,17 @@ def test_model_from_pretrained(self): model = SiglipTextModel.from_pretrained(model_name) self.assertIsNotNone(model) + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, + logit_keys=("pooler_output", "last_hidden_state"), + use_attention_mask_options=(False, True), + ) + class SiglipModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): @@ -437,7 +615,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class SiglipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): +class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (SiglipModel,) if is_torch_available() else () pipeline_model_mapping = {"feature-extraction": SiglipModel} if is_torch_available() else {} fx_compatible = False @@ -481,22 +659,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - 
@unittest.skip(reason="SiglipModel does not support training") - def test_training(self): - pass - - @unittest.skip(reason="SiglipModel does not support training") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="SiglipModel does not support training") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="SiglipModel does not support training") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip(reason="Siglip uses the same initialization scheme as the Flax original implementation") def test_initialization(self): pass @@ -595,6 +757,100 @@ def test_model_from_pretrained(self): model = SiglipModel.from_pretrained(model_name) self.assertIsNotNone(model) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + def test_flash_attn_2_inference_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16) + dummy_input_ids = inputs_dict["input_ids"] + + outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True) + outputs_fa = model_fa( + pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True + ) + + self.assertTrue( + torch.allclose(outputs.logits_per_image, outputs_fa.logits_per_image, atol=4e-2, rtol=4e-2), + f"Image logits max diff: {torch.max(torch.abs(outputs.logits_per_image - outputs_fa.logits_per_image))}", + ) + self.assertTrue( + torch.allclose(outputs.logits_per_text, outputs_fa.logits_per_text, atol=4e-2, rtol=4e-2), + f"Text logits max diff: {torch.max(torch.abs(outputs.logits_per_text - outputs_fa.logits_per_text))}", + ) + + # Test with attention mask + dummy_attention_mask = inputs_dict["attention_mask"] + + if dummy_attention_mask is not None: + dummy_attention_mask[:, 1:] = 1 + dummy_attention_mask[:, :1] = 0 + + outputs = model( + pixel_values=dummy_pixel_values, + input_ids=dummy_input_ids, + attention_mask=dummy_attention_mask, + output_hidden_states=True, + ) + outputs_fa = model_fa( + pixel_values=dummy_pixel_values, + input_ids=dummy_input_ids, + attention_mask=dummy_attention_mask, + output_hidden_states=True, + ) + + self.assertTrue( + torch.allclose(outputs.logits_per_image, outputs_fa.logits_per_image, atol=4e-2, rtol=4e-2), + f"Logits max diff: {torch.max(torch.abs(outputs.logits_per_image - outputs_fa.logits_per_image))}", + ) + self.assertTrue( + torch.allclose(outputs.logits_per_text, outputs_fa.logits_per_text, atol=4e-2, rtol=4e-2), + f"Logits max diff: {torch.max(torch.abs(outputs.logits_per_text - outputs_fa.logits_per_text))}", + ) + + # check with inference + dropout + model.train() + _ = model_fa( + pixel_values=dummy_pixel_values, + input_ids=dummy_input_ids, + attention_mask=dummy_attention_mask, + output_hidden_states=True, + ) + + @require_flash_attn + @require_torch_gpu + 
@mark.flash_attn_test + def test_flash_attn_2_inference_equivalence_right_padding(self): + self.skipTest("SigLIP does not support right padding") + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, + logit_keys=("logits_per_image", "logits_per_text", "image_embeds", "text_embeds"), + use_attention_mask_options=(False, True), + ) + class SiglipForImageClassificationModelTester(SiglipModelTester): def __init__(self, parent): @@ -618,7 +874,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class SiglipForImageClassificationModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): +class SiglipForImageClassificationModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (SiglipForImageClassification,) if is_torch_available() else () pipeline_model_mapping = {"image-classification": SiglipForImageClassification} if is_torch_available() else {} fx_compatible = False @@ -660,6 +916,15 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_initialization(self): pass + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, logit_keys=("logits",), use_attention_mask_options=(False,) + ) + # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py index 290f31c3641cfb..99ba51e35f6663 100644 --- a/tests/models/vits/test_modeling_vits.py +++ b/tests/models/vits/test_modeling_vits.py @@ -181,6 +181,10 @@ def test_config(self): def test_pipeline_feature_extraction(self): super().test_pipeline_feature_extraction() + @is_flaky(description="torch 2.2.0 gives `Timeout >120.0s`") + def test_pipeline_feature_extraction_fp16(self): + super().test_pipeline_feature_extraction_fp16() + @unittest.skip(reason="Need to fix this after #26538") def test_model_forward(self): set_seed(12345) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 86a89af8c13359..dcb495d95a6e4d 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1539,6 +1539,46 @@ def test_longform_generate_multi_batch(self): def test_longform_generate_multi_batch_cond_prev(self): self._check_longform_generate_multi_batch(condition_on_prev_tokens=True) + def test_custom_4d_attention_mask(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = WhisperForConditionalGeneration(config).to(device=torch_device, dtype=torch.float32) + model.eval() + + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self._get_custom_4d_mask_test_data() + + with torch.no_grad(): + logits = model.forward( + decoder_input_ids=input_ids, + input_features=input_dict["input_features"], + decoder_position_ids=position_ids, + ).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + decoder_input_ids=input_ids_shared_prefix, + input_features=input_dict["input_features"], + decoder_attention_mask=mask_shared_prefix, + 
decoder_position_ids=position_ids_shared_prefix,
+            )[0]
+            # logits_shared_prefix.shape == torch.Size([1, 6, ...])
+
+        out_last_tokens = logits[:, -1, :]  # last tokens in each batch line
+        out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :]  # last three tokens
+
+        # comparing greedily-chosen tokens:
+        assert torch.equal(out_last_tokens.max(axis=1).indices, out_shared_prefix_last_tokens.max(axis=1).indices)
+
+        # comparing softmax-normalized logits (normalize over the vocabulary dimension):
+        normalized_0 = torch.nn.functional.softmax(out_last_tokens, dim=-1)
+        normalized_1 = torch.nn.functional.softmax(out_shared_prefix_last_tokens, dim=-1)
+        torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)
+

 @require_torch
 @require_torchaudio
@@ -2961,6 +3001,34 @@ def test_whisper_empty_longform_multi_gpu(self):
         torch.manual_seed(0)
         model.generate(**inputs, **gen_kwargs)

+    @slow
+    def test_tiny_static_generation(self):
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        model.to(torch_device)
+
+        input_speech = self._load_datasamples(4)
+        input_features = processor(input_speech, return_tensors="pt", sampling_rate=16_000).input_features
+        input_features = input_features.to(torch_device)
+        eager_generated_ids = model.generate(input_features, max_new_tokens=64)
+
+        model.generation_config.cache_implementation = "static"
+        model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+
+        # compile the forward pass and assert equivalence
+        static_generated_ids = model.generate(input_features, max_new_tokens=64)
+        assert (eager_generated_ids == static_generated_ids).all()
+
+        # check the compiled graph can be re-used and that the cache is correctly reset
+        # reverse the ordering of the input features
+        permutation_idx = (
+            torch.arange(input_features.shape[0], 0, step=-1, dtype=torch.long, device=input_features.device) - 1
+        )
+        input_features = input_features[permutation_idx, ...]
+        static_generated_ids = model.generate(input_features, max_new_tokens=64)
+        # assert re-ordered generations match those from eager
+        assert (eager_generated_ids[permutation_idx, :] == static_generated_ids).all()
+

 def prepare_whisper_encoder_inputs_dict(config, input_features, head_mask=None):
     if head_mask is None:
@@ -3564,6 +3632,10 @@ def test_decoder_model_attn_mask_past(self):
             config=config, input_ids=inputs_dict["input_ids"]
         )

+    @unittest.skip(reason="Tested implicitly through the encoder-decoder tests")
+    def test_custom_4d_attention_mask(self):
+        pass
+
     @unittest.skip(reason="Generate needs input ids")
     def test_generate_without_input_ids(self):
         # generate only works with input ids for whisper
diff --git a/tests/models/zoedepth/__init__.py b/tests/models/zoedepth/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/zoedepth/test_image_processing_zoedepth.py b/tests/models/zoedepth/test_image_processing_zoedepth.py
new file mode 100644
index 00000000000000..7dd82daf0d5f24
--- /dev/null
+++ b/tests/models/zoedepth/test_image_processing_zoedepth.py
@@ -0,0 +1,187 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.file_utils import is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import ZoeDepthImageProcessor + + +class ZoeDepthImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + ensure_multiple_of=32, + keep_aspect_ratio=False, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_pad=False, + ): + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.ensure_multiple_of = ensure_multiple_of + self.keep_aspect_ratio = keep_aspect_ratio + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_pad = do_pad + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "ensure_multiple_of": self.ensure_multiple_of, + "keep_aspect_ratio": self.keep_aspect_ratio, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_pad": self.do_pad, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.ensure_multiple_of, self.ensure_multiple_of + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class ZoeDepthImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = ZoeDepthImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + + self.image_processor_tester = ZoeDepthImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "ensure_multiple_of")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "do_pad")) + + 
def test_image_processor_from_dict_with_kwargs(self):
+        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
+        self.assertEqual(image_processor.size, {"height": 18, "width": 18})
+
+        image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
+        self.assertEqual(image_processor.size, {"height": 42, "width": 42})
+
+    def test_ensure_multiple_of(self):
+        # Test `ensure_multiple_of` by turning off all other variables which affect the size, using a size that is not a multiple of 32
+        image = np.zeros((489, 640, 3))
+
+        size = {"height": 380, "width": 513}
+        multiple = 32
+        image_processor = ZoeDepthImageProcessor(
+            do_pad=False, ensure_multiple_of=multiple, size=size, keep_aspect_ratio=False
+        )
+        pixel_values = image_processor(image, return_tensors="pt").pixel_values
+
+        self.assertEqual(list(pixel_values.shape), [1, 3, 384, 512])
+        self.assertTrue(pixel_values.shape[2] % multiple == 0)
+        self.assertTrue(pixel_values.shape[3] % multiple == 0)
+
+        # Test `ensure_multiple_of` by turning off all other variables which affect the size, using a size that is already a multiple of 32
+        image = np.zeros((511, 511, 3))
+
+        height, width = 512, 512
+        size = {"height": height, "width": width}
+        multiple = 32
+        image_processor = ZoeDepthImageProcessor(
+            do_pad=False, ensure_multiple_of=multiple, size=size, keep_aspect_ratio=False
+        )
+        pixel_values = image_processor(image, return_tensors="pt").pixel_values
+
+        self.assertEqual(list(pixel_values.shape), [1, 3, height, width])
+        self.assertTrue(pixel_values.shape[2] % multiple == 0)
+        self.assertTrue(pixel_values.shape[3] % multiple == 0)
+
+    def test_keep_aspect_ratio(self):
+        # Test `keep_aspect_ratio=True` by turning off all other variables which affect the size
+        height, width = 489, 640
+        image = np.zeros((height, width, 3))
+
+        size = {"height": 512, "width": 512}
+        image_processor = ZoeDepthImageProcessor(do_pad=False, keep_aspect_ratio=True, size=size, ensure_multiple_of=1)
+        pixel_values = image_processor(image, return_tensors="pt").pixel_values
+
+        # As can be seen, the aspect ratio is preserved: the height matches the specified size, while the width is scaled by the same factor
+        self.assertEqual(list(pixel_values.shape), [1, 3, 512, 670])
+
+        # Test `keep_aspect_ratio=False` by turning off all other variables which affect the size
+        image_processor = ZoeDepthImageProcessor(
+            do_pad=False, keep_aspect_ratio=False, size=size, ensure_multiple_of=1
+        )
+        pixel_values = image_processor(image, return_tensors="pt").pixel_values
+
+        # As can be seen, the specified size is respected
+        self.assertEqual(list(pixel_values.shape), [1, 3, size["height"], size["width"]])
+
+        # Test `keep_aspect_ratio=True` with `ensure_multiple_of` set
+        image = np.zeros((489, 640, 3))
+
+        size = {"height": 511, "width": 511}
+        multiple = 32
+        image_processor = ZoeDepthImageProcessor(size=size, keep_aspect_ratio=True, ensure_multiple_of=multiple)
+
+        pixel_values = image_processor(image, return_tensors="pt").pixel_values
+
+        self.assertEqual(list(pixel_values.shape), [1, 3, 512, 672])
+        self.assertTrue(pixel_values.shape[2] % multiple == 0)
+        self.assertTrue(pixel_values.shape[3] % multiple == 0)
diff --git a/tests/models/zoedepth/test_modeling_zoedepth.py b/tests/models/zoedepth/test_modeling_zoedepth.py
new file mode 100644
index 00000000000000..571c44f2f47266
--- /dev/null
+++ b/tests/models/zoedepth/test_modeling_zoedepth.py
@@ -0,0 +1,257 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch ZoeDepth model.""" + +import unittest + +from transformers import Dinov2Config, ZoeDepthConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ZoeDepthForDepthEstimation + + +if is_vision_available(): + from PIL import Image + + from transformers import ZoeDepthImageProcessor + + +class ZoeDepthModelTester: + def __init__( + self, + parent, + batch_size=2, + num_channels=3, + image_size=32, + patch_size=16, + use_labels=True, + num_labels=3, + is_training=True, + hidden_size=4, + num_hidden_layers=2, + num_attention_heads=2, + intermediate_size=8, + out_features=["stage1", "stage2"], + apply_layernorm=False, + reshape_hidden_states=False, + neck_hidden_sizes=[2, 2], + fusion_hidden_size=6, + bottleneck_features=6, + num_out_features=[6, 6, 6, 6], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.out_features = out_features + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.use_labels = use_labels + self.num_labels = num_labels + self.is_training = is_training + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + self.bottleneck_features = bottleneck_features + self.num_out_features = num_out_features + # ZoeDepth's sequence length + self.seq_length = (self.image_size // self.patch_size) ** 2 + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return ZoeDepthConfig( + backbone_config=self.get_backbone_config(), + backbone=None, + neck_hidden_sizes=self.neck_hidden_sizes, + fusion_hidden_size=self.fusion_hidden_size, + bottleneck_features=self.bottleneck_features, + num_out_features=self.num_out_features, + ) + + def get_backbone_config(self): + return Dinov2Config( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + 
is_training=self.is_training, + out_features=self.out_features, + reshape_hidden_states=self.reshape_hidden_states, + ) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = ZoeDepthForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class ZoeDepthModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ZoeDepth does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (ZoeDepthForDepthEstimation,) if is_torch_available() else () + pipeline_model_mapping = {"depth-estimation": ZoeDepthForDepthEstimation} if is_torch_available() else {} + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = ZoeDepthModelTester(self) + self.config_tester = ConfigTester( + self, config_class=ZoeDepthConfig, has_text_modality=False, hidden_size=37, common_properties=[] + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="ZoeDepth with AutoBackbone does not have a base model and hence no input_embeddings") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="ZoeDepth with AutoBackbone does not have a base model and hence no input_embeddings") + def test_model_get_set_embeddings(self): + pass + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + @unittest.skip(reason="ZoeDepth with AutoBackbone does not have a base model and hence no input_embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="ZoeDepth with AutoBackbone does not have a base model") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="ZoeDepth with AutoBackbone does not have a base model") + def test_save_load_fast_init_to_base(self): + pass + + @unittest.skip(reason="ZoeDepth does not support training yet") + def test_training(self): + pass + + @unittest.skip(reason="ZoeDepth does not support training yet") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="ZoeDepth does not support training yet") + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip(reason="ZoeDepth does not support training yet") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "Intel/zoedepth-nyu" + model = ZoeDepthForDepthEstimation.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class ZoeDepthModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu") 
+ model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 512)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[1.0020, 1.0219, 1.0389], [1.0349, 1.0816, 1.1000], [1.0576, 1.1094, 1.1249]], + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_depth_estimation_multiple_heads(self): + image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti") + model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 512)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[1.1571, 1.1438, 1.1783], [1.2163, 1.2036, 1.2320], [1.2688, 1.2461, 1.2734]], + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/pipelines/test_pipelines_audio_classification.py b/tests/pipelines/test_pipelines_audio_classification.py index cdedf94be180f5..a8c5deb2284452 100644 --- a/tests/pipelines/test_pipelines_audio_classification.py +++ b/tests/pipelines/test_pipelines_audio_classification.py @@ -35,8 +35,10 @@ class AudioClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + audio_classifier = AudioClassificationPipeline( + model=model, feature_extractor=processor, torch_dtype=torch_dtype + ) # test with a raw waveform audio = np.zeros((34000,)) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 35c36aa0e660ee..11bbde4143f7e8 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -66,14 +66,14 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if tokenizer is None: # Side effect of no Fast Tokenizer class for these model, so skipping # But the slow tokenizer test should still run as they're quite small self.skipTest(reason="No tokenizer available") speech_recognizer = AutomaticSpeechRecognitionPipeline( - model=model, tokenizer=tokenizer, feature_extractor=processor + model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype ) # test with a raw waveform diff --git a/tests/pipelines/test_pipelines_depth_estimation.py 
b/tests/pipelines/test_pipelines_depth_estimation.py index 259ab5ef4c3915..1f2700fa747c58 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -56,8 +56,8 @@ def hashimage(image: Image) -> str: class DepthEstimationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - depth_estimator = DepthEstimationPipeline(model=model, image_processor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + depth_estimator = DepthEstimationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return depth_estimator, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_document_question_answering.py b/tests/pipelines/test_pipelines_document_question_answering.py index d1fd87e18e377c..41a6a0c383f9b4 100644 --- a/tests/pipelines/test_pipelines_document_question_answering.py +++ b/tests/pipelines/test_pipelines_document_question_answering.py @@ -61,9 +61,13 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase): @require_pytesseract @require_vision - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): dqa_pipeline = pipeline( - "document-question-answering", model=model, tokenizer=tokenizer, image_processor=processor + "document-question-answering", + model=model, + tokenizer=tokenizer, + image_processor=processor, + torch_dtype=torch_dtype, ) image = INVOICE_URL diff --git a/tests/pipelines/test_pipelines_feature_extraction.py b/tests/pipelines/test_pipelines_feature_extraction.py index ff5f8314b65cc4..4d25941c3f0fd9 100644 --- a/tests/pipelines/test_pipelines_feature_extraction.py +++ b/tests/pipelines/test_pipelines_feature_extraction.py @@ -174,7 +174,7 @@ def get_shape(self, input_, shape=None): raise ValueError("We expect lists of floats, nothing else") return shape - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if tokenizer is None: self.skipTest(reason="No tokenizer") elif ( @@ -193,7 +193,9 @@ def get_test_pipeline(self, model, tokenizer, processor): For now ignore those. 
""" ) - feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, feature_extractor=processor) + feature_extractor = FeatureExtractionPipeline( + model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype + ) return feature_extractor, ["This is a test", "This is another test"] def run_pipeline_test(self, feature_extractor, examples): diff --git a/tests/pipelines/test_pipelines_fill_mask.py b/tests/pipelines/test_pipelines_fill_mask.py index 93dacbd15bf4fd..81aa23563710f7 100644 --- a/tests/pipelines/test_pipelines_fill_mask.py +++ b/tests/pipelines/test_pipelines_fill_mask.py @@ -251,11 +251,11 @@ def test_model_no_pad_tf(self): unmasker.tokenizer.pad_token = None self.run_pipeline_test(unmasker, []) - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if tokenizer is None or tokenizer.mask_token_id is None: self.skipTest(reason="The provided tokenizer has no mask token, (probably reformer or wav2vec2)") - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer) + fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) examples = [ f"This is another {tokenizer.mask_token} test", ] diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py index 3e93f31d18807e..823c66c16f32c0 100644 --- a/tests/pipelines/test_pipelines_image_classification.py +++ b/tests/pipelines/test_pipelines_image_classification.py @@ -55,8 +55,10 @@ class ImageClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - image_classifier = ImageClassificationPipeline(model=model, image_processor=processor, top_k=2) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + image_classifier = ImageClassificationPipeline( + model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype + ) examples = [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), "http://images.cocodataset.org/val2017/000000039769.jpg", diff --git a/tests/pipelines/test_pipelines_image_feature_extraction.py b/tests/pipelines/test_pipelines_image_feature_extraction.py index 53af000d6de93d..07b27e7b6465bc 100644 --- a/tests/pipelines/test_pipelines_image_feature_extraction.py +++ b/tests/pipelines/test_pipelines_image_feature_extraction.py @@ -157,7 +157,7 @@ def test_return_tensors_tf(self): outputs = feature_extractor(img, return_tensors=True) self.assertTrue(tf.is_tensor(outputs)) - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if processor is None: self.skipTest(reason="No image processor") @@ -175,7 +175,9 @@ def get_test_pipeline(self, model, tokenizer, processor): """ ) - feature_extractor = ImageFeatureExtractionPipeline(model=model, image_processor=processor) + feature_extractor = ImageFeatureExtractionPipeline( + model=model, image_processor=processor, torch_dtype=torch_dtype + ) img = prepare_img() return feature_extractor, [img, img] diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 8f2ae47f14ba36..523bd0b52b6823 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ 
b/tests/pipelines/test_pipelines_image_segmentation.py @@ -87,8 +87,8 @@ class ImageSegmentationPipelineTests(unittest.TestCase): + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor): - image_segmenter = ImageSegmentationPipeline(model=model, image_processor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + image_segmenter = ImageSegmentationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return image_segmenter, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_image_to_image.py b/tests/pipelines/test_pipelines_image_to_image.py index e9110bb692954e..29d590a8e34c4d 100644 --- a/tests/pipelines/test_pipelines_image_to_image.py +++ b/tests/pipelines/test_pipelines_image_to_image.py @@ -54,9 +54,9 @@ class ImageToImagePipelineTests(unittest.TestCase): @require_torch @require_vision @slow - def test_pipeline(self): + def test_pipeline(self, torch_dtype="float32"): model_id = "caidas/swin2SR-classical-sr-x2-64" - upscaler = pipeline("image-to-image", model=model_id) + upscaler = pipeline("image-to-image", model=model_id, torch_dtype=torch_dtype) upscaled_list = upscaler(self.examples) self.assertEqual(len(upscaled_list), len(self.examples)) @@ -66,6 +66,12 @@ def test_pipeline(self): self.assertEqual(upscaled_list[0].size, (1296, 976)) self.assertEqual(upscaled_list[1].size, (1296, 976)) + @require_torch + @require_vision + @slow + def test_pipeline_fp16(self): + self.test_pipeline(torch_dtype="float16") + @require_torch @require_vision @slow diff --git a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py index c77353a261f91d..43a796da46df6c 100644 --- a/tests/pipelines/test_pipelines_image_to_text.py +++ b/tests/pipelines/test_pipelines_image_to_text.py @@ -17,7 +17,7 @@ import requests from transformers import MODEL_FOR_VISION_2_SEQ_MAPPING, TF_MODEL_FOR_VISION_2_SEQ_MAPPING, is_vision_available -from transformers.pipelines import pipeline +from transformers.pipelines import ImageToTextPipeline, pipeline from transformers.testing_utils import ( is_pipeline_test, require_tf, @@ -45,8 +45,10 @@ class ImageToTextPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING tf_model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer, image_processor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + pipe = ImageToTextPipeline( + model=model, tokenizer=tokenizer, image_processor=processor, torch_dtype=torch_dtype + ) examples = [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_mask_generation.py b/tests/pipelines/test_pipelines_mask_generation.py index 643ee84e683717..50fcd676da50df 100644 --- a/tests/pipelines/test_pipelines_mask_generation.py +++ b/tests/pipelines/test_pipelines_mask_generation.py @@ -67,8 +67,8 @@ class MaskGenerationPipelineTests(unittest.TestCase): (list(TF_MODEL_FOR_MASK_GENERATION_MAPPING.items()) if TF_MODEL_FOR_MASK_GENERATION_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor): - image_segmenter = 
MaskGenerationPipeline(model=model, image_processor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + image_segmenter = MaskGenerationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return image_segmenter, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_object_detection.py b/tests/pipelines/test_pipelines_object_detection.py index 76a6ab807cd993..f14e5e6b68d771 100644 --- a/tests/pipelines/test_pipelines_object_detection.py +++ b/tests/pipelines/test_pipelines_object_detection.py @@ -53,8 +53,8 @@ def open(*args, **kwargs): class ObjectDetectionPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - object_detector = ObjectDetectionPipeline(model=model, image_processor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + object_detector = ObjectDetectionPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"] def run_pipeline_test(self, object_detector, examples): diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py index f7683aec15c32d..8b68989600ee1a 100644 --- a/tests/pipelines/test_pipelines_question_answering.py +++ b/tests/pipelines/test_pipelines_question_answering.py @@ -50,12 +50,12 @@ class QAPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if isinstance(model.config, LxmertConfig): # This is an bimodal model, we need to find a more consistent way # to switch on those models. 
return None, None - question_answerer = QuestionAnsweringPipeline(model, tokenizer) + question_answerer = QuestionAnsweringPipeline(model, tokenizer, torch_dtype=torch_dtype) examples = [ {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, diff --git a/tests/pipelines/test_pipelines_summarization.py b/tests/pipelines/test_pipelines_summarization.py index 8d745c376d84cd..fb1dce0ca3849e 100644 --- a/tests/pipelines/test_pipelines_summarization.py +++ b/tests/pipelines/test_pipelines_summarization.py @@ -32,8 +32,8 @@ class SummarizationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"] def run_pipeline_test(self, summarizer, _): diff --git a/tests/pipelines/test_pipelines_table_question_answering.py b/tests/pipelines/test_pipelines_table_question_answering.py index a30763fc096d7e..9481ab200063f8 100644 --- a/tests/pipelines/test_pipelines_table_question_answering.py +++ b/tests/pipelines/test_pipelines_table_question_answering.py @@ -152,9 +152,9 @@ def test_small_model_tf(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch - def test_small_model_pt(self): + def test_small_model_pt(self, torch_dtype="float32"): model_id = "lysandre/tiny-tapas-random-wtq" - model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) + model = AutoModelForTableQuestionAnswering.from_pretrained(model_id, torch_dtype=torch_dtype) tokenizer = AutoTokenizer.from_pretrained(model_id) self.assertIsInstance(model.config.aggregation_labels, dict) self.assertIsInstance(model.config.no_aggregation_label_index, int) @@ -255,9 +255,14 @@ def test_small_model_pt(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch - def test_slow_tokenizer_sqa_pt(self): + def test_small_model_pt_fp16(self): + self.test_small_model_pt(torch_dtype="float16") + + @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") + @require_torch + def test_slow_tokenizer_sqa_pt(self, torch_dtype="float32"): model_id = "lysandre/tiny-tapas-random-sqa" - model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) + model = AutoModelForTableQuestionAnswering.from_pretrained(model_id, torch_dtype=torch_dtype) tokenizer = AutoTokenizer.from_pretrained(model_id) table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer) @@ -373,6 +378,11 @@ def test_slow_tokenizer_sqa_pt(self): }, ) + @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") + @require_torch + def test_slow_tokenizer_sqa_pt_fp16(self): + self.test_slow_tokenizer_sqa_pt(torch_dtype="float16") + @require_tf @require_tensorflow_probability @require_pandas @@ -498,8 +508,8 @@ def test_slow_tokenizer_sqa_tf(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch - def 
test_integration_wtq_pt(self): - table_querier = pipeline("table-question-answering") + def test_integration_wtq_pt(self, torch_dtype="float32"): + table_querier = pipeline("table-question-answering", torch_dtype=torch_dtype) data = { "Repository": ["Transformers", "Datasets", "Tokenizers"], @@ -541,6 +551,12 @@ def test_integration_wtq_pt(self): ] self.assertListEqual(results, expected_results) + @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") + @slow + @require_torch + def test_integration_wtq_pt_fp16(self): + self.test_integration_wtq_pt(torch_dtype="float16") + @slow @require_tensorflow_probability @require_pandas @@ -593,11 +609,12 @@ def test_integration_wtq_tf(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch - def test_integration_sqa_pt(self): + def test_integration_sqa_pt(self, torch_dtype="float32"): table_querier = pipeline( "table-question-answering", model="google/tapas-base-finetuned-sqa", tokenizer="google/tapas-base-finetuned-sqa", + torch_dtype=torch_dtype, ) data = { "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], @@ -615,6 +632,12 @@ def test_integration_sqa_pt(self): ] self.assertListEqual(results, expected_results) + @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") + @slow + @require_torch + def test_integration_sqa_pt_fp16(self): + self.test_integration_sqa_pt(torch_dtype="float16") + @slow @require_tensorflow_probability @require_pandas @@ -645,11 +668,12 @@ def test_integration_sqa_tf(self): @slow @require_torch - def test_large_model_pt_tapex(self): + def test_large_model_pt_tapex(self, torch_dtype="float32"): model_id = "microsoft/tapex-large-finetuned-wtq" table_querier = pipeline( "table-question-answering", model=model_id, + torch_dtype=torch_dtype, ) data = { "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], diff --git a/tests/pipelines/test_pipelines_text2text_generation.py b/tests/pipelines/test_pipelines_text2text_generation.py index eccae9850b3b59..52fb59edd36443 100644 --- a/tests/pipelines/test_pipelines_text2text_generation.py +++ b/tests/pipelines/test_pipelines_text2text_generation.py @@ -35,8 +35,8 @@ class Text2TextGenerationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return generator, ["Something to write", "Something else"] def run_pipeline_test(self, generator, _): diff --git a/tests/pipelines/test_pipelines_text_classification.py b/tests/pipelines/test_pipelines_text_classification.py index 63adfc45a02981..4956cb8aed132d 100644 --- a/tests/pipelines/test_pipelines_text_classification.py +++ b/tests/pipelines/test_pipelines_text_classification.py @@ -179,8 +179,8 @@ def test_tf_bert(self): outputs = text_classifier("Birds are a type of animal") self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}]) - def get_test_pipeline(self, model, tokenizer, processor): - text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, 
model, tokenizer, processor, torch_dtype="float32"): + text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return text_classifier, ["HuggingFace is in", "This is another test"] def run_pipeline_test(self, text_classifier, _): diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 00ddd77f826071..94132b5f55978b 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -320,8 +320,8 @@ def test_small_chat_model_tf(self): ], ) - def get_test_pipeline(self, model, tokenizer, processor): - text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return text_generator, ["This is a test", "Another test"] def test_stop_sequence_stopping_criteria(self): @@ -398,7 +398,7 @@ def run_pipeline_test(self, text_generator, _): self.assertEqual(outputs, [{"generated_text": ANY(str)}]) else: with self.assertRaises((ValueError, AssertionError)): - outputs = text_generator("") + outputs = text_generator("", add_special_tokens=False) if text_generator.framework == "tf": # TF generation does not support max_new_tokens, and it's impossible diff --git a/tests/pipelines/test_pipelines_text_to_audio.py b/tests/pipelines/test_pipelines_text_to_audio.py index b780d26d79a43a..655fe5961b527d 100644 --- a/tests/pipelines/test_pipelines_text_to_audio.py +++ b/tests/pipelines/test_pipelines_text_to_audio.py @@ -250,8 +250,8 @@ def test_generative_model_kwargs(self): outputs = music_generator("This is a test", forward_params=forward_params, generate_kwargs=generate_kwargs) self.assertListEqual(outputs["audio"].tolist(), audio.tolist()) - def get_test_pipeline(self, model, tokenizer, processor): - speech_generator = TextToAudioPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + speech_generator = TextToAudioPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return speech_generator, ["This is a test", "Another test"] def run_pipeline_test(self, speech_generator, _): diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py index eda9ac014bf730..41415c8c34589e 100644 --- a/tests/pipelines/test_pipelines_token_classification.py +++ b/tests/pipelines/test_pipelines_token_classification.py @@ -56,8 +56,8 @@ class TokenClassificationPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor): - token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return token_classifier, ["A simple string", "A simple string that is quite a bit longer"] def run_pipeline_test(self, token_classifier, _): diff --git a/tests/pipelines/test_pipelines_translation.py b/tests/pipelines/test_pipelines_translation.py index 61d390fe76ebc1..c31ba49e7660a2 100644 --- a/tests/pipelines/test_pipelines_translation.py +++ b/tests/pipelines/test_pipelines_translation.py @@ -35,12 +35,14 @@ 
class TranslationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if isinstance(model.config, MBartConfig): src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2] - translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang) + translator = TranslationPipeline( + model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang, torch_dtype=torch_dtype + ) else: - translator = TranslationPipeline(model=model, tokenizer=tokenizer) + translator = TranslationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return translator, ["Some string", "Some other text"] def run_pipeline_test(self, translator, _): diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index 392d3b31b4c950..280d6990788ea1 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -38,11 +38,13 @@ class VideoClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): example_video_filepath = hf_hub_download( repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset" ) - video_classifier = VideoClassificationPipeline(model=model, image_processor=processor, top_k=2) + video_classifier = VideoClassificationPipeline( + model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype + ) examples = [ example_video_filepath, "https://huggingface.co/datasets/nateraw/video-demo/resolve/main/archery.mp4", diff --git a/tests/pipelines/test_pipelines_visual_question_answering.py b/tests/pipelines/test_pipelines_visual_question_answering.py index e056adee23311f..45f935a62aaf38 100644 --- a/tests/pipelines/test_pipelines_visual_question_answering.py +++ b/tests/pipelines/test_pipelines_visual_question_answering.py @@ -55,8 +55,10 @@ def open(*args, **kwargs): class VisualQuestionAnsweringPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa") + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + vqa_pipeline = pipeline( + "visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa", torch_dtype=torch_dtype + ) examples = [ { "image": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), diff --git a/tests/pipelines/test_pipelines_zero_shot.py b/tests/pipelines/test_pipelines_zero_shot.py index 2e61d97c1dc8c9..1003898df6c968 100644 --- a/tests/pipelines/test_pipelines_zero_shot.py +++ b/tests/pipelines/test_pipelines_zero_shot.py @@ -42,9 +42,9 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): classifier = ZeroShotClassificationPipeline( - model=model, 
tokenizer=tokenizer, candidate_labels=["polics", "health"] + model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"], torch_dtype=torch_dtype ) return classifier, ["Who are you voting for in 2020?", "My stomach hurts."] diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index 60562fe7aa1184..c0894fb394b72c 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -28,9 +28,11 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase): # model_mapping = {CLAPConfig: CLAPModel} @require_torch - def test_small_model_pt(self): + def test_small_model_pt(self, torch_dtype="float32"): audio_classifier = pipeline( - task="zero-shot-audio-classification", model="hf-internal-testing/tiny-clap-htsat-unfused" + task="zero-shot-audio-classification", + model="hf-internal-testing/tiny-clap-htsat-unfused", + torch_dtype=torch_dtype, ) dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example") audio = dataset["train"]["audio"][-1]["array"] @@ -40,6 +42,10 @@ def test_small_model_pt(self): [{"score": 0.501, "label": "Sound of a dog"}, {"score": 0.499, "label": "Sound of vaccum cleaner"}], ) + @require_torch + def test_small_model_pt_fp16(self): + self.test_small_model_pt(torch_dtype="float16") + @unittest.skip(reason="No models are available in TF") def test_small_model_tf(self): pass diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py index 5c3208866ee251..b4501e43733572 100644 --- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py @@ -71,9 +71,9 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase): # outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) @require_torch - def test_small_model_pt(self): + def test_small_model_pt(self, torch_dtype="float32"): image_classifier = pipeline( - model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", + model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", torch_dtype=torch_dtype ) image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") output = image_classifier(image, candidate_labels=["a", "b", "c"]) @@ -127,6 +127,10 @@ def test_small_model_pt(self): ], ) + @require_torch + def test_small_model_pt_fp16(self): + self.test_small_model_pt(torch_dtype="float16") + @require_tf def test_small_model_tf(self): image_classifier = pipeline( diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py index 065e5c211e67e5..799c54dfbb87d7 100644 --- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py @@ -43,9 +43,11 @@ def open(*args, **kwargs): class ZeroShotObjectDetectionPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): object_detector = pipeline( - "zero-shot-object-detection", model="hf-internal-testing/tiny-random-owlvit-object-detection" + "zero-shot-object-detection", + model="hf-internal-testing/tiny-random-owlvit-object-detection", + 
torch_dtype=torch_dtype, ) examples = [ diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 57521a3e7c530d..8d6ae394cfd3b3 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -20,7 +20,7 @@ from transformers import is_torch_available -from .test_configuration_utils import config_common_kwargs +from .utils.test_configuration_utils import config_common_kwargs class ConfigTester(object): diff --git a/tests/test_image_processing_utils.py b/tests/test_image_processing_utils.py deleted file mode 100644 index bab0769c922068..00000000000000 --- a/tests/test_image_processing_utils.py +++ /dev/null @@ -1,148 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import tempfile -import unittest -import unittest.mock as mock -from pathlib import Path - -from huggingface_hub import HfFolder, delete_repo -from requests.exceptions import HTTPError - -from transformers import AutoImageProcessor, ViTImageProcessor -from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test - - -sys.path.append(str(Path(__file__).parent.parent / "utils")) - -from test_module.custom_image_processing import CustomImageProcessor # noqa E402 - - -SAMPLE_IMAGE_PROCESSING_CONFIG_DIR = get_tests_dir("fixtures") - - -class ImageProcessorUtilTester(unittest.TestCase): - def test_cached_files_are_used_when_internet_is_down(self): - # A mock response for an HTTP head request to emulate server down - response_mock = mock.Mock() - response_mock.status_code = 500 - response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError - response_mock.json.return_value = {} - - # Download this model to make sure it's in the cache. - _ = ViTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-vit") - # Under the mock environment we get a 500 error when trying to reach the model. 
- with mock.patch("requests.Session.request", return_value=response_mock) as mock_head: - _ = ViTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-vit") - # This check we did call the fake head request - mock_head.assert_called() - - def test_image_processor_from_pretrained_subfolder(self): - with self.assertRaises(OSError): - # config is in subfolder, the following should not work without specifying the subfolder - _ = AutoImageProcessor.from_pretrained("hf-internal-testing/stable-diffusion-all-variants") - - config = AutoImageProcessor.from_pretrained( - "hf-internal-testing/stable-diffusion-all-variants", subfolder="feature_extractor" - ) - - self.assertIsNotNone(config) - - -@is_staging_test -class ImageProcessorPushToHubTester(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls._token = TOKEN - HfFolder.save_token(TOKEN) - - @classmethod - def tearDownClass(cls): - try: - delete_repo(token=cls._token, repo_id="test-image-processor") - except HTTPError: - pass - - try: - delete_repo(token=cls._token, repo_id="valid_org/test-image-processor-org") - except HTTPError: - pass - - try: - delete_repo(token=cls._token, repo_id="test-dynamic-image-processor") - except HTTPError: - pass - - def test_push_to_hub(self): - image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR) - image_processor.push_to_hub("test-image-processor", token=self._token) - - new_image_processor = ViTImageProcessor.from_pretrained(f"{USER}/test-image-processor") - for k, v in image_processor.__dict__.items(): - self.assertEqual(v, getattr(new_image_processor, k)) - - # Reset repo - delete_repo(token=self._token, repo_id="test-image-processor") - - # Push to hub via save_pretrained - with tempfile.TemporaryDirectory() as tmp_dir: - image_processor.save_pretrained( - tmp_dir, repo_id="test-image-processor", push_to_hub=True, token=self._token - ) - - new_image_processor = ViTImageProcessor.from_pretrained(f"{USER}/test-image-processor") - for k, v in image_processor.__dict__.items(): - self.assertEqual(v, getattr(new_image_processor, k)) - - def test_push_to_hub_in_organization(self): - image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR) - image_processor.push_to_hub("valid_org/test-image-processor", token=self._token) - - new_image_processor = ViTImageProcessor.from_pretrained("valid_org/test-image-processor") - for k, v in image_processor.__dict__.items(): - self.assertEqual(v, getattr(new_image_processor, k)) - - # Reset repo - delete_repo(token=self._token, repo_id="valid_org/test-image-processor") - - # Push to hub via save_pretrained - with tempfile.TemporaryDirectory() as tmp_dir: - image_processor.save_pretrained( - tmp_dir, repo_id="valid_org/test-image-processor-org", push_to_hub=True, token=self._token - ) - - new_image_processor = ViTImageProcessor.from_pretrained("valid_org/test-image-processor-org") - for k, v in image_processor.__dict__.items(): - self.assertEqual(v, getattr(new_image_processor, k)) - - def test_push_to_hub_dynamic_image_processor(self): - CustomImageProcessor.register_for_auto_class() - image_processor = CustomImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR) - - image_processor.push_to_hub("test-dynamic-image-processor", token=self._token) - - # This has added the proper auto_map field to the config - self.assertDictEqual( - image_processor.auto_map, - {"AutoImageProcessor": "custom_image_processing.CustomImageProcessor"}, - ) - - new_image_processor = 
AutoImageProcessor.from_pretrained( - f"{USER}/test-dynamic-image-processor", trust_remote_code=True - ) - # Can't make an isinstance check because the new_image_processor is from the CustomImageProcessor class of a dynamic module - self.assertEqual(new_image_processor.__class__.__name__, "CustomImageProcessor") diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 4561c93c21db31..299d99280b335b 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -119,7 +119,7 @@ if is_flax_available(): import jax.numpy as jnp - from tests.test_modeling_flax_utils import check_models_equal + from tests.utils.test_modeling_flax_utils import check_models_equal from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -1158,6 +1158,7 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa "input_features", "input_ids", "input_values", + "inputs_embeds", "pixel_values", "token_type_ids", "visual_feats", @@ -1214,16 +1215,27 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa (past_mask, inputs_to_test[1]["attention_mask"]), dim=1 ) + if "inputs_embeds" in inspect.signature(model.forward).parameters and not model.config.is_encoder_decoder: + inputs_to_test.append( + { + "inputs_embeds": torch.rand( + 2, 2, model.config.hidden_size, dtype=torch.float, device=torch_device + ) + } + ) + for inps in inputs_to_test: filtered_inputs = {k: v for (k, v) in inps.items() if k in input_names} - input_names = list(filtered_inputs.keys()) + input_names_to_trace = list(filtered_inputs.keys()) if model.__class__.__name__ in set(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values()) and ( not hasattr(model.config, "problem_type") or model.config.problem_type is None ): model.config.problem_type = "single_label_classification" - traced_model = symbolic_trace(model, input_names) + model.config.use_cache = "past_key_values" in input_names_to_trace + + traced_model = symbolic_trace(model, input_names_to_trace) with torch.no_grad(): traced_output = traced_model(**filtered_inputs) diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index f2292510f71192..6ca7ea0681db58 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -126,16 +126,18 @@ class PipelineTesterMixin: pipeline_model_mapping = None supported_frameworks = ["pt", "tf"] - def run_task_tests(self, task): + def run_task_tests(self, task, torch_dtype="float32"): """Run pipeline tests for a specific `task` Args: task (`str`): A task name. This should be a key in the mapping `pipeline_test_mapping`. + torch_dtype (`str`, `optional`, defaults to `'float32'`): + The torch dtype to use for the model. Can be used for FP16/other precision inference. """ if task not in self.pipeline_model_mapping: self.skipTest( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: `{task}` is not in " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: `{task}` is not in " f"`self.pipeline_model_mapping` for `{self.__class__.__name__}`." 
) @@ -171,10 +173,12 @@ def run_task_tests(self, task): repo_name = model_arch_name self.run_model_pipeline_tests( - task, repo_name, model_architecture, tokenizer_names, processor_names, commit + task, repo_name, model_architecture, tokenizer_names, processor_names, commit, torch_dtype ) - def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenizer_names, processor_names, commit): + def run_model_pipeline_tests( + self, task, repo_name, model_architecture, tokenizer_names, processor_names, commit, torch_dtype="float32" + ): """Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class names Args: @@ -188,6 +192,10 @@ def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenize A list of names of a subclasses of `PreTrainedTokenizerFast` or `PreTrainedTokenizer`. processor_names (`List[str]`): A list of names of subclasses of `BaseImageProcessor` or `FeatureExtractionMixin`. + commit (`str`): + The commit hash of the model repository on the Hub. + torch_dtype (`str`, `optional`, defaults to `'float32'`): + The torch dtype to use for the model. Can be used for FP16/other precision inference. """ # Get an instance of the corresponding class `XXXPipelineTests` in order to use `get_test_pipeline` and # `run_pipeline_test`. @@ -203,14 +211,18 @@ def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenize processor_name, ): logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: test is " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " f"`{tokenizer_name}` | processor `{processor_name}`." ) continue - self.run_pipeline_test(task, repo_name, model_architecture, tokenizer_name, processor_name, commit) + self.run_pipeline_test( + task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype + ) - def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, processor_name, commit): + def run_pipeline_test( + self, task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype="float32" + ): """Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class name The model will be loaded from a model repository on the Hub. @@ -226,6 +238,10 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, The name of a subclass of `PreTrainedTokenizerFast` or `PreTrainedTokenizer`. processor_name (`str`): The name of a subclass of `BaseImageProcessor` or `FeatureExtractionMixin`. + commit (`str`): + The commit hash of the model repository on the Hub. + torch_dtype (`str`, `optional`, defaults to `'float32'`): + The torch dtype to use for the model. Can be used for FP16/other precision inference. """ repo_id = f"{TRANSFORMERS_TINY_MODEL_PATH}/{repo_name}" if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing": @@ -245,7 +261,7 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, processor = processor_class.from_pretrained(repo_id, revision=commit) except Exception: logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not load the " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not load the " f"processor from `{repo_id}` with `{processor_name}`." 
) self.skipTest(f"Could not load the processor from {repo_id} with {processor_name}.") @@ -253,7 +269,7 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, # TODO: Maybe not upload such problematic tiny models to Hub. if tokenizer is None and processor is None: logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not find or load " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not find or load " f"any tokenizer / processor from `{repo_id}`." ) self.skipTest(f"Could not find or load any tokenizer / processor from {repo_id}.") @@ -263,7 +279,7 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, model = model_architecture.from_pretrained(repo_id, revision=commit) except Exception: logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not find or load " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not find or load " f"the model from `{repo_id}` with `{model_architecture}`." ) self.skipTest(f"Could not find or load the model from {repo_id} with {model_architecture}.") @@ -271,7 +287,7 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, pipeline_test_class_name = pipeline_test_mapping[task]["test"].__name__ if self.is_pipeline_test_to_skip_more(pipeline_test_class_name, model.config, model, tokenizer, processor): logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: test is " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " f"`{tokenizer_name}` | processor `{processor_name}`." ) @@ -289,12 +305,12 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, # `run_pipeline_test`. task_test = pipeline_test_mapping[task]["test"]() - pipeline, examples = task_test.get_test_pipeline(model, tokenizer, processor) + pipeline, examples = task_test.get_test_pipeline(model, tokenizer, processor, torch_dtype=torch_dtype) if pipeline is None: # The test can disable itself, but it should be very marginal # Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist) logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not get the " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not get the " "pipeline for testing." 
) self.skipTest(reason="Could not get the pipeline for testing.") @@ -324,10 +340,20 @@ def data(n): def test_pipeline_audio_classification(self): self.run_task_tests(task="audio-classification") + @is_pipeline_test + @require_torch + def test_pipeline_audio_classification_fp16(self): + self.run_task_tests(task="audio-classification", torch_dtype="float16") + @is_pipeline_test def test_pipeline_automatic_speech_recognition(self): self.run_task_tests(task="automatic-speech-recognition") + @is_pipeline_test + @require_torch + def test_pipeline_automatic_speech_recognition_fp16(self): + self.run_task_tests(task="automatic-speech-recognition", torch_dtype="float16") + @is_pipeline_test @require_vision @require_timm @@ -335,6 +361,13 @@ def test_pipeline_automatic_speech_recognition(self): def test_pipeline_depth_estimation(self): self.run_task_tests(task="depth-estimation") + @is_pipeline_test + @require_vision + @require_timm + @require_torch + def test_pipeline_depth_estimation_fp16(self): + self.run_task_tests(task="depth-estimation", torch_dtype="float16") + @is_pipeline_test @require_pytesseract @require_torch @@ -342,20 +375,43 @@ def test_pipeline_depth_estimation(self): def test_pipeline_document_question_answering(self): self.run_task_tests(task="document-question-answering") + @is_pipeline_test + @require_pytesseract + @require_torch + @require_vision + def test_pipeline_document_question_answering_fp16(self): + self.run_task_tests(task="document-question-answering", torch_dtype="float16") + @is_pipeline_test def test_pipeline_feature_extraction(self): self.run_task_tests(task="feature-extraction") + @is_pipeline_test + @require_torch + def test_pipeline_feature_extraction_fp16(self): + self.run_task_tests(task="feature-extraction", torch_dtype="float16") + @is_pipeline_test def test_pipeline_fill_mask(self): self.run_task_tests(task="fill-mask") + @is_pipeline_test + @require_torch + def test_pipeline_fill_mask_fp16(self): + self.run_task_tests(task="fill-mask", torch_dtype="float16") + @is_pipeline_test @require_torch_or_tf @require_vision def test_pipeline_image_classification(self): self.run_task_tests(task="image-classification") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_image_classification_fp16(self): + self.run_task_tests(task="image-classification", torch_dtype="float16") + @is_pipeline_test @require_vision @require_timm @@ -363,11 +419,24 @@ def test_pipeline_image_classification(self): def test_pipeline_image_segmentation(self): self.run_task_tests(task="image-segmentation") + @is_pipeline_test + @require_vision + @require_timm + @require_torch + def test_pipeline_image_segmentation_fp16(self): + self.run_task_tests(task="image-segmentation", torch_dtype="float16") + @is_pipeline_test @require_vision def test_pipeline_image_to_text(self): self.run_task_tests(task="image-to-text") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_image_to_text_fp16(self): + self.run_task_tests(task="image-to-text", torch_dtype="float16") + @is_pipeline_test @require_timm @require_vision @@ -375,6 +444,13 @@ def test_pipeline_image_to_text(self): def test_pipeline_image_feature_extraction(self): self.run_task_tests(task="image-feature-extraction") + @is_pipeline_test + @require_timm + @require_vision + @require_torch + def test_pipeline_image_feature_extraction_fp16(self): + self.run_task_tests(task="image-feature-extraction", torch_dtype="float16") + @unittest.skip(reason="`run_pipeline_test` is currently not implemented.") 
@is_pipeline_test @require_vision @@ -382,6 +458,13 @@ def test_pipeline_image_feature_extraction(self): def test_pipeline_mask_generation(self): self.run_task_tests(task="mask-generation") + @unittest.skip(reason="`run_pipeline_test` is currently not implemented.") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_mask_generation_fp16(self): + self.run_task_tests(task="mask-generation", torch_dtype="float16") + @is_pipeline_test @require_vision @require_timm @@ -389,44 +472,96 @@ def test_pipeline_mask_generation(self): def test_pipeline_object_detection(self): self.run_task_tests(task="object-detection") + @is_pipeline_test + @require_vision + @require_timm + @require_torch + def test_pipeline_object_detection_fp16(self): + self.run_task_tests(task="object-detection", torch_dtype="float16") + @is_pipeline_test def test_pipeline_question_answering(self): self.run_task_tests(task="question-answering") + @is_pipeline_test + @require_torch + def test_pipeline_question_answering_fp16(self): + self.run_task_tests(task="question-answering", torch_dtype="float16") + @is_pipeline_test def test_pipeline_summarization(self): self.run_task_tests(task="summarization") + @is_pipeline_test + @require_torch + def test_pipeline_summarization_fp16(self): + self.run_task_tests(task="summarization", torch_dtype="float16") + @is_pipeline_test def test_pipeline_table_question_answering(self): self.run_task_tests(task="table-question-answering") + @is_pipeline_test + @require_torch + def test_pipeline_table_question_answering_fp16(self): + self.run_task_tests(task="table-question-answering", torch_dtype="float16") + @is_pipeline_test def test_pipeline_text2text_generation(self): self.run_task_tests(task="text2text-generation") + @is_pipeline_test + @require_torch + def test_pipeline_text2text_generation_fp16(self): + self.run_task_tests(task="text2text-generation", torch_dtype="float16") + @is_pipeline_test def test_pipeline_text_classification(self): self.run_task_tests(task="text-classification") + @is_pipeline_test + @require_torch + def test_pipeline_text_classification_fp16(self): + self.run_task_tests(task="text-classification", torch_dtype="float16") + @is_pipeline_test @require_torch_or_tf def test_pipeline_text_generation(self): self.run_task_tests(task="text-generation") + @is_pipeline_test + @require_torch + def test_pipeline_text_generation_fp16(self): + self.run_task_tests(task="text-generation", torch_dtype="float16") + @is_pipeline_test @require_torch def test_pipeline_text_to_audio(self): self.run_task_tests(task="text-to-audio") + @is_pipeline_test + @require_torch + def test_pipeline_text_to_audio_fp16(self): + self.run_task_tests(task="text-to-audio", torch_dtype="float16") + @is_pipeline_test def test_pipeline_token_classification(self): self.run_task_tests(task="token-classification") + @is_pipeline_test + @require_torch + def test_pipeline_token_classification_fp16(self): + self.run_task_tests(task="token-classification", torch_dtype="float16") + @is_pipeline_test def test_pipeline_translation(self): self.run_task_tests(task="translation") + @is_pipeline_test + @require_torch + def test_pipeline_translation_fp16(self): + self.run_task_tests(task="translation", torch_dtype="float16") + @is_pipeline_test @require_torch_or_tf @require_vision @@ -434,32 +569,67 @@ def test_pipeline_translation(self): def test_pipeline_video_classification(self): self.run_task_tests(task="video-classification") + @is_pipeline_test + @require_vision + @require_decord + @require_torch + 
def test_pipeline_video_classification_fp16(self): + self.run_task_tests(task="video-classification", torch_dtype="float16") + @is_pipeline_test @require_torch @require_vision def test_pipeline_visual_question_answering(self): self.run_task_tests(task="visual-question-answering") + @is_pipeline_test + @require_torch + @require_vision + def test_pipeline_visual_question_answering_fp16(self): + self.run_task_tests(task="visual-question-answering", torch_dtype="float16") + @is_pipeline_test def test_pipeline_zero_shot(self): self.run_task_tests(task="zero-shot") + @is_pipeline_test + @require_torch + def test_pipeline_zero_shot_fp16(self): + self.run_task_tests(task="zero-shot", torch_dtype="float16") + @is_pipeline_test @require_torch def test_pipeline_zero_shot_audio_classification(self): self.run_task_tests(task="zero-shot-audio-classification") + @is_pipeline_test + @require_torch + def test_pipeline_zero_shot_audio_classification_fp16(self): + self.run_task_tests(task="zero-shot-audio-classification", torch_dtype="float16") + @is_pipeline_test @require_vision def test_pipeline_zero_shot_image_classification(self): self.run_task_tests(task="zero-shot-image-classification") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_zero_shot_image_classification_fp16(self): + self.run_task_tests(task="zero-shot-image-classification", torch_dtype="float16") + @is_pipeline_test @require_vision @require_torch def test_pipeline_zero_shot_object_detection(self): self.run_task_tests(task="zero-shot-object-detection") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_zero_shot_object_detection_fp16(self): + self.run_task_tests(task="zero-shot-object-detection", torch_dtype="float16") + # This contains the test cases to be skipped without model architecture being involved. 
def is_pipeline_test_to_skip( self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 26fa4624674ec5..e31e6cb8229b84 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1653,6 +1653,84 @@ def test_galore_adafactor_all_linear(self): self.assertTrue(galore_peak_memory < upper_bound_pm) self.assertTrue(lower_bound_pm < galore_peak_memory) + @require_galore_torch + @require_torch_gpu + def test_galore_lr_display_without_scheduler(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + with tempfile.TemporaryDirectory() as tmpdir: + learning_rate = 1e-9 + num_steps = 10 + + # Trainer without inf/nan filter + args = TrainingArguments( + tmpdir, + learning_rate=learning_rate, + logging_steps=5, + optim="galore_adamw", + optim_target_modules=[r".*attn.*", r".*mlp.*"], + ) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) + + # reflects displayed lr in trainer + self.assertEqual(trainer.get_learning_rates(), [learning_rate, learning_rate]) + + @require_galore_torch + @require_torch_gpu + def test_galore_lr_display_with_scheduler(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + with tempfile.TemporaryDirectory() as tmpdir: + learning_rate = 2e-4 + num_train_epochs = 2 + num_warmup_steps = 5 + + # Trainer without inf/nan filter + args = TrainingArguments( + tmpdir, + num_train_epochs=num_train_epochs, + learning_rate=learning_rate, + warmup_steps=num_warmup_steps, + lr_scheduler_type="cosine", + logging_steps=1, + optim="galore_adamw", + optim_target_modules=[r".*attn.*", r".*mlp.*"], + ) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + + # creating log history of trainer, results don't matter + trainer.train() + logs = trainer.state.log_history[1:][:-1] + + # reach given learning rate peak and end with 0 lr + self.assertTrue(logs[num_warmup_steps - 2]["learning_rate"] == learning_rate) + self.assertTrue(logs[-1]["learning_rate"] == 0) + + # increasing and decreasing pattern of lrs + increasing_lrs = [ + logs[i]["learning_rate"] < logs[i + 1]["learning_rate"] + for i in range(len(logs)) + if i < num_warmup_steps - 2 + ] + decreasing_lrs = [ + logs[i]["learning_rate"] > logs[i + 1]["learning_rate"] + for i in range(len(logs) - 1) + if i >= num_warmup_steps - 2 + ] + + self.assertTrue(all(increasing_lrs)) + self.assertTrue(all(decreasing_lrs)) + + # warm up steps << total steps + self.assertTrue(len(decreasing_lrs) > len(increasing_lrs)) + @require_torch_multi_accelerator def test_data_is_not_parallelized_when_model_is_parallel(self): model = RegressionModel() diff --git a/tests/test_cache_utils.py b/tests/utils/test_cache_utils.py similarity index 100% rename from tests/test_cache_utils.py rename to tests/utils/test_cache_utils.py diff --git a/tests/utils/test_chat_template_utils.py b/tests/utils/test_chat_template_utils.py index cff31c1f8a3483..1816ddd9512693 100644 --- a/tests/utils/test_chat_template_utils.py +++ b/tests/utils/test_chat_template_utils.py @@ -137,7 +137,7 @@ def fn(x: List[List[Union[str, 
int]]]): "properties": { "x": { "type": "array", - "items": {"type": "array", "items": {"type": ["string", "integer"]}}, + "items": {"type": "array", "items": {"type": ["integer", "string"]}}, "description": "The input", } }, @@ -455,13 +455,13 @@ def fn( }, "y": { "type": "array", - "items": {"type": ["string", "integer"]}, + "items": {"type": ["integer", "string"]}, "nullable": True, "description": "The second input. It's a big list with a single-line description.", }, "z": { "type": "array", - "prefixItems": [{"type": ["string", "integer"]}, {"type": "string"}], + "prefixItems": [{"type": ["integer", "string"]}, {"type": "string"}], "description": "The third input. It's some kind of tuple with a default arg.", }, }, diff --git a/tests/test_configuration_utils.py b/tests/utils/test_configuration_utils.py similarity index 99% rename from tests/test_configuration_utils.py rename to tests/utils/test_configuration_utils.py index b9f090e061fa72..6809b3a2ce5f0c 100644 --- a/tests/test_configuration_utils.py +++ b/tests/utils/test_configuration_utils.py @@ -30,7 +30,7 @@ from transformers.testing_utils import TOKEN, USER, is_staging_test -sys.path.append(str(Path(__file__).parent.parent / "utils")) +sys.path.append(str(Path(__file__).parent.parent.parent / "utils")) from test_module.custom_configuration import CustomConfig # noqa E402 diff --git a/tests/test_feature_extraction_utils.py b/tests/utils/test_feature_extraction_utils.py similarity index 100% rename from tests/test_feature_extraction_utils.py rename to tests/utils/test_feature_extraction_utils.py diff --git a/tests/utils/test_image_processing_utils.py b/tests/utils/test_image_processing_utils.py index afb6283e6ed345..4b2586a634d8a2 100644 --- a/tests/utils/test_image_processing_utils.py +++ b/tests/utils/test_image_processing_utils.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 HuggingFace Inc. +# Copyright 2024 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,9 +13,140 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys +import tempfile import unittest +import unittest.mock as mock +from pathlib import Path +from huggingface_hub import HfFolder, delete_repo +from requests.exceptions import HTTPError + +from transformers import AutoImageProcessor, ViTImageProcessor from transformers.image_processing_utils import get_size_dict +from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test + + +sys.path.append(str(Path(__file__).parent.parent / "utils")) + +from test_module.custom_image_processing import CustomImageProcessor # noqa E402 + + +SAMPLE_IMAGE_PROCESSING_CONFIG_DIR = get_tests_dir("fixtures") + + +class ImageProcessorUtilTester(unittest.TestCase): + def test_cached_files_are_used_when_internet_is_down(self): + # A mock response for an HTTP head request to emulate server down + response_mock = mock.Mock() + response_mock.status_code = 500 + response_mock.headers = {} + response_mock.raise_for_status.side_effect = HTTPError + response_mock.json.return_value = {} + + # Download this model to make sure it's in the cache. + _ = ViTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-vit") + # Under the mock environment we get a 500 error when trying to reach the model. 
+ with mock.patch("requests.Session.request", return_value=response_mock) as mock_head: + _ = ViTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-vit") + # This check we did call the fake head request + mock_head.assert_called() + + def test_image_processor_from_pretrained_subfolder(self): + with self.assertRaises(OSError): + # config is in subfolder, the following should not work without specifying the subfolder + _ = AutoImageProcessor.from_pretrained("hf-internal-testing/stable-diffusion-all-variants") + + config = AutoImageProcessor.from_pretrained( + "hf-internal-testing/stable-diffusion-all-variants", subfolder="feature_extractor" + ) + + self.assertIsNotNone(config) + + +@is_staging_test +class ImageProcessorPushToHubTester(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._token = TOKEN + HfFolder.save_token(TOKEN) + + @classmethod + def tearDownClass(cls): + try: + delete_repo(token=cls._token, repo_id="test-image-processor") + except HTTPError: + pass + + try: + delete_repo(token=cls._token, repo_id="valid_org/test-image-processor-org") + except HTTPError: + pass + + try: + delete_repo(token=cls._token, repo_id="test-dynamic-image-processor") + except HTTPError: + pass + + def test_push_to_hub(self): + image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR) + image_processor.push_to_hub("test-image-processor", token=self._token) + + new_image_processor = ViTImageProcessor.from_pretrained(f"{USER}/test-image-processor") + for k, v in image_processor.__dict__.items(): + self.assertEqual(v, getattr(new_image_processor, k)) + + # Reset repo + delete_repo(token=self._token, repo_id="test-image-processor") + + # Push to hub via save_pretrained + with tempfile.TemporaryDirectory() as tmp_dir: + image_processor.save_pretrained( + tmp_dir, repo_id="test-image-processor", push_to_hub=True, token=self._token + ) + + new_image_processor = ViTImageProcessor.from_pretrained(f"{USER}/test-image-processor") + for k, v in image_processor.__dict__.items(): + self.assertEqual(v, getattr(new_image_processor, k)) + + def test_push_to_hub_in_organization(self): + image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR) + image_processor.push_to_hub("valid_org/test-image-processor", token=self._token) + + new_image_processor = ViTImageProcessor.from_pretrained("valid_org/test-image-processor") + for k, v in image_processor.__dict__.items(): + self.assertEqual(v, getattr(new_image_processor, k)) + + # Reset repo + delete_repo(token=self._token, repo_id="valid_org/test-image-processor") + + # Push to hub via save_pretrained + with tempfile.TemporaryDirectory() as tmp_dir: + image_processor.save_pretrained( + tmp_dir, repo_id="valid_org/test-image-processor-org", push_to_hub=True, token=self._token + ) + + new_image_processor = ViTImageProcessor.from_pretrained("valid_org/test-image-processor-org") + for k, v in image_processor.__dict__.items(): + self.assertEqual(v, getattr(new_image_processor, k)) + + def test_push_to_hub_dynamic_image_processor(self): + CustomImageProcessor.register_for_auto_class() + image_processor = CustomImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR) + + image_processor.push_to_hub("test-dynamic-image-processor", token=self._token) + + # This has added the proper auto_map field to the config + self.assertDictEqual( + image_processor.auto_map, + {"AutoImageProcessor": "custom_image_processing.CustomImageProcessor"}, + ) + + new_image_processor = 
AutoImageProcessor.from_pretrained( + f"{USER}/test-dynamic-image-processor", trust_remote_code=True + ) + # Can't make an isinstance check because the new_image_processor is from the CustomImageProcessor class of a dynamic module + self.assertEqual(new_image_processor.__class__.__name__, "CustomImageProcessor") class ImageProcessingUtilsTester(unittest.TestCase): diff --git a/tests/test_modeling_flax_utils.py b/tests/utils/test_modeling_flax_utils.py similarity index 100% rename from tests/test_modeling_flax_utils.py rename to tests/utils/test_modeling_flax_utils.py diff --git a/tests/test_modeling_tf_utils.py b/tests/utils/test_modeling_tf_utils.py similarity index 100% rename from tests/test_modeling_tf_utils.py rename to tests/utils/test_modeling_tf_utils.py diff --git a/tests/test_modeling_utils.py b/tests/utils/test_modeling_utils.py old mode 100755 new mode 100644 similarity index 98% rename from tests/test_modeling_utils.py rename to tests/utils/test_modeling_utils.py index 758fe4d1fdf398..c86c340017b0d9 --- a/tests/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -73,7 +73,7 @@ ) -sys.path.append(str(Path(__file__).parent.parent / "utils")) +sys.path.append(str(Path(__file__).parent.parent.parent / "utils")) from test_module.custom_configuration import CustomConfig, NoSuperInitConfig # noqa E402 @@ -1065,6 +1065,23 @@ def test_cached_files_are_used_when_internet_is_down(self): # This check we did call the fake head request mock_head.assert_called() + @require_accelerate + @mark.accelerate_tests + def test_save_model_with_device_map_cpu(self): + model_id = "hf-internal-testing/tiny-random-gpt2" + inputs = torch.tensor([[1, 2, 3]]) + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu") + output = model(inputs)[0] + model.save_pretrained( + tmp_dir, max_shard_size="200KB" + ) # model is 1.6MB, max shard size is allocated to cpu by default + saved_model = AutoModelForCausalLM.from_pretrained(tmp_dir, device_map="cpu") + saved_model_output = saved_model(inputs)[0] + + self.assertTrue(torch.allclose(output, saved_model_output)) + @require_accelerate @mark.accelerate_tests @require_torch_accelerator @@ -1083,9 +1100,9 @@ def test_save_offloaded_model(self): # check_models_equal requires onloaded tensors model_id = "hf-internal-testing/tiny-random-gpt2" - onloaded_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu") + onloaded_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu").to(f"{torch_device}:0") inputs = torch.tensor([[1, 2, 3]]).to(f"{torch_device}:0") - cpu_output = onloaded_model(inputs)[0] + output = onloaded_model(inputs)[0] with tempfile.TemporaryDirectory() as tmp_dir: offload_folder = os.path.join(tmp_dir, "offload") @@ -1099,7 +1116,7 @@ def test_save_offloaded_model(self): saved_model = AutoModelForCausalLM.from_pretrained(tmp_dir, device_map=device_map) postsaved_output = saved_model(inputs)[0] - self.assertTrue(torch.allclose(cpu_output, presaved_output, atol=1e-4)) + self.assertTrue(torch.allclose(output, presaved_output, atol=1e-4)) self.assertTrue(torch.allclose(presaved_output, postsaved_output)) @require_safetensors @@ -1407,20 +1424,15 @@ def test_pretrained_low_mem_new_config(self): self.assertEqual(model.__class__.__name__, model_ref.__class__.__name__) def test_generation_config_is_loaded_with_model(self): - # Note: `joaogante/tiny-random-gpt2-with-generation-config` has a `generation_config.json` containing a dummy - # 
`transformers_version` field set to `foo`. If loading the file fails, this test also fails. + # Note: `TinyLlama/TinyLlama-1.1B-Chat-v1.0` has a `generation_config.json` containing `max_length: 2048` # 1. Load without further parameters - model = AutoModelForCausalLM.from_pretrained( - "joaogante/tiny-random-gpt2-with-generation-config", use_safetensors=False - ) - self.assertEqual(model.generation_config.transformers_version, "foo") + model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") + self.assertEqual(model.generation_config.max_length, 2048) # 2. Load with `device_map` - model = AutoModelForCausalLM.from_pretrained( - "joaogante/tiny-random-gpt2-with-generation-config", device_map="auto", use_safetensors=False - ) - self.assertEqual(model.generation_config.transformers_version, "foo") + model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto") + self.assertEqual(model.generation_config.max_length, 2048) @require_safetensors def test_safetensors_torch_from_torch(self): diff --git a/tests/test_tokenization_utils.py b/tests/utils/test_tokenization_utils.py similarity index 100% rename from tests/test_tokenization_utils.py rename to tests/utils/test_tokenization_utils.py diff --git a/utils/diff_model_converter.py b/utils/diff_model_converter.py index e86c6405d48631..f05c57581c8ff1 100644 --- a/utils/diff_model_converter.py +++ b/utils/diff_model_converter.py @@ -497,7 +497,7 @@ def leave_ClassDef(self, original_node, updated_node): start_insert_idx -= 1 self.new_body[dependency] = {"insert_idx": start_insert_idx, "node": node} elif dependency not in self.inserted_deps: - # make sure the node is written after it's dependencies + # make sure the node is written after its dependencies start_insert_idx = self.new_body[dependency]["insert_idx"] - 1 self.inserted_deps.append(dependency) if len(list_dependencies) > 0:
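To make the `torch_dtype` plumbing above easier to follow, here is a minimal, self-contained sketch of the pattern the test files in this patch share: a test (or `get_test_pipeline` helper) takes a `torch_dtype` argument defaulting to `"float32"`, forwards it to the `pipeline()` factory, and a thin `*_fp16` wrapper re-runs the same body with `torch_dtype="float16"`. This is an illustration, not part of the patch; the checkpoint name and fixture path are the ones referenced in the diff, and running it from outside the transformers repository root (or on hardware where fp16 inference is unsupported) is left as an assumption.

# Sketch of the fp16 parametrization pattern used throughout this patch.
# Assumes it is run from the root of the transformers repository so that the
# COCO fixture image referenced in the diff exists on disk.
import unittest

from transformers import pipeline
from transformers.testing_utils import require_torch, require_vision


class ZeroShotImageClassificationFP16Sketch(unittest.TestCase):
    @require_torch
    @require_vision
    def test_small_model_pt(self, torch_dtype="float32"):
        # `torch_dtype` is forwarded to the pipeline factory, which loads the
        # underlying model in the requested precision.
        image_classifier = pipeline(
            model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification",
            torch_dtype=torch_dtype,
        )
        output = image_classifier(
            "./tests/fixtures/tests_samples/COCO/000000039769.png",
            candidate_labels=["a", "b", "c"],
        )
        # Scores differ between fp32 and fp16, so only check the output structure.
        self.assertEqual(sorted(result["label"] for result in output), ["a", "b", "c"])

    @require_torch
    @require_vision
    def test_small_model_pt_fp16(self):
        # The fp16 variant re-runs the fp32 test body with a different dtype,
        # mirroring the `*_fp16` methods added in tests/test_pipeline_mixin.py.
        self.test_small_model_pt(torch_dtype="float16")


if __name__ == "__main__":
    unittest.main()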