From 1a585c1222a56bcaecc070966d558d4a9d862e83 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 14 Dec 2023 19:50:20 +0000 Subject: [PATCH 01/30] Remove warning when Annotion enum is created (#28048) Remove warning when enum is created --- src/transformers/image_utils.py | 25 ++++++++----------- .../image_processing_conditional_detr.py | 1 - .../image_processing_deformable_detr.py | 1 - .../models/deta/image_processing_deta.py | 1 - .../models/detr/image_processing_detr.py | 1 - .../models/yolos/image_processing_yolos.py | 1 - 6 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 5d280bf5e2b49a..99eac953bc3208 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -15,7 +15,6 @@ import base64 import os -from enum import EnumMeta from io import BytesIO from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union @@ -76,16 +75,7 @@ class AnnotationFormat(ExplicitEnum): COCO_PANOPTIC = "coco_panoptic" -class DeprecatedEnumMeta(EnumMeta): - def __init__(cls, *args, **kwargs): - super().__init__(*args, **kwargs) - logger.warning_once( - f"`{cls.__name__}` is deprecated and will be removed in v4.38. " - f"Please use `transformers.image_utils.AnnotationFormat` instead." - ) - - -class AnnotionFormat(ExplicitEnum, metaclass=DeprecatedEnumMeta): +class AnnotionFormat(ExplicitEnum): COCO_DETECTION = AnnotationFormat.COCO_DETECTION.value COCO_PANOPTIC = AnnotationFormat.COCO_PANOPTIC.value @@ -703,10 +693,17 @@ def validate_annotations( supported_annotation_formats: Tuple[AnnotationFormat, ...], annotations: List[Dict], ) -> None: - if promote_annotation_format(annotation_format) not in supported_annotation_formats: + if isinstance(annotation_format, AnnotionFormat): + logger.warning_once( + f"`{annotation_format.__class__.__name__}` is deprecated and will be removed in v4.38. " + f"Please use `{AnnotationFormat.__name__}` instead." + ) + annotation_format = promote_annotation_format(annotation_format) + + if annotation_format not in supported_annotation_formats: raise ValueError(f"Unsupported annotation format: {format} must be one of {supported_annotation_formats}") - if promote_annotation_format(annotation_format) is AnnotationFormat.COCO_DETECTION: + if annotation_format is AnnotationFormat.COCO_DETECTION: if not valid_coco_detection_annotations(annotations): raise ValueError( "Invalid COCO detection annotations. Annotations must a dict (single image) or list of dicts " @@ -714,7 +711,7 @@ def validate_annotations( "being a list of annotations in the COCO format." ) - if promote_annotation_format(annotation_format) is AnnotationFormat.COCO_PANOPTIC: + if annotation_format is AnnotationFormat.COCO_PANOPTIC: if not valid_coco_panoptic_annotations(annotations): raise ValueError( "Invalid COCO panoptic annotations. 
Annotations must a dict (single image) or list of dicts " diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 23e493e08bf105..2fe33db810890a 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -39,7 +39,6 @@ IMAGENET_DEFAULT_STD, AnnotationFormat, AnnotationType, - AnnotionFormat, # noqa: F401 ChannelDimension, ImageInput, PILImageResampling, diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 00cf8eaecfa308..8c40d20c816ad3 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -39,7 +39,6 @@ IMAGENET_DEFAULT_STD, AnnotationFormat, AnnotationType, - AnnotionFormat, # noqa: F401 ChannelDimension, ImageInput, PILImageResampling, diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index 1e3ece8e324ad0..bdd7ab11182ee6 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -35,7 +35,6 @@ IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, AnnotationFormat, - AnnotionFormat, # noqa: F401 ChannelDimension, ImageInput, PILImageResampling, diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 8b64b9c4d9a46b..24c36c5d102273 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -38,7 +38,6 @@ IMAGENET_DEFAULT_STD, AnnotationFormat, AnnotationType, - AnnotionFormat, # noqa: F401 ChannelDimension, ImageInput, PILImageResampling, diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 4b59fd5ef04905..3b0c635c0ee4d6 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -37,7 +37,6 @@ IMAGENET_DEFAULT_STD, AnnotationFormat, AnnotationType, - AnnotionFormat, # noqa: F401 ChannelDimension, ImageInput, PILImageResampling, From 1e2093176515ddfd7a7dc5f77b2bb4d6a1bc3445 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 15 Dec 2023 11:08:27 +0100 Subject: [PATCH 02/30] [`FA-2`] Fix fa-2 issue when passing `config` to `from_pretrained` (#28043) * fix fa-2 issue * fix test * Update src/transformers/modeling_utils.py Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> * clenaer fix * up * add more robust tests * Update src/transformers/modeling_utils.py Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> * fixup * Update src/transformers/modeling_utils.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * pop * add test --------- Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/modeling_utils.py | 12 ++++++++++++ tests/test_modeling_utils.py | 25 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/transformers/modeling_utils.py 
b/src/transformers/modeling_utils.py index 3247c323685815..7e5d3e54e619e8 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2955,6 +2955,18 @@ def from_pretrained( **kwargs, ) else: + # In case one passes a config to `from_pretrained` + "attn_implementation" + # override the `_attn_implementation` attribute to `attn_implementation` of the kwargs + # Please see: https://github.com/huggingface/transformers/issues/28038 + + # Overwrite `config._attn_implementation` by the one from the kwargs --> in auto-factory + # we pop attn_implementation from the kwargs but this handles the case where users + # passes manually the config to `from_pretrained`. + config = copy.deepcopy(config) + + kwarg_attn_imp = kwargs.pop("attn_implementation", None) + if kwarg_attn_imp is not None and config._attn_implementation != kwarg_attn_imp: + config._attn_implementation = kwarg_attn_imp model_kwargs = kwargs quantizer = None diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index ddfaad5214dc50..a8a483b4017c84 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -1823,6 +1823,16 @@ def test_error_no_flash_available(self): self.assertTrue("does not support Flash Attention 2.0" in str(cm.exception)) + def test_error_no_flash_available_with_config(self): + with self.assertRaises(ValueError) as cm: + config = AutoConfig.from_pretrained("hf-tiny-model-private/tiny-random-MCTCTModel") + + _ = AutoModel.from_pretrained( + "hf-tiny-model-private/tiny-random-MCTCTModel", config=config, attn_implementation="flash_attention_2" + ) + + self.assertTrue("does not support Flash Attention 2.0" in str(cm.exception)) + def test_error_wrong_attn_implementation(self): with self.assertRaises(ValueError) as cm: _ = AutoModel.from_pretrained("hf-tiny-model-private/tiny-random-MCTCTModel", attn_implementation="foo") @@ -1840,6 +1850,21 @@ def test_not_available_flash(self): self.assertTrue("the package flash_attn seems to be not installed" in str(cm.exception)) + def test_not_available_flash_with_config(self): + if is_flash_attn_2_available(): + self.skipTest("Please uninstall flash-attn package to run test_not_available_flash") + + config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-GPTBigCodeModel") + + with self.assertRaises(ImportError) as cm: + _ = AutoModel.from_pretrained( + "hf-internal-testing/tiny-random-GPTBigCodeModel", + config=config, + attn_implementation="flash_attention_2", + ) + + self.assertTrue("the package flash_attn seems to be not installed" in str(cm.exception)) + def test_not_available_sdpa(self): if is_torch_sdpa_available(): self.skipTest("This test requires torch<=2.0") From e737446ee65e6c6175e9cf945669bcc2629c2447 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 15 Dec 2023 11:34:42 +0100 Subject: [PATCH 03/30] [`Modeling` / `Mixtral`] Fix GC + PEFT issues with Mixtral (#28061) fix for mistral --- .../models/mixtral/modeling_mixtral.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 3ae8567317eff0..c07346c6de19f4 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1016,6 +1016,13 @@ def forward( past_key_values_length = 0 + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + 
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: @@ -1058,13 +1065,6 @@ def forward( hidden_states = inputs_embeds - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None From 7e876dca54a367632c2f1b47f1b5171441252742 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Fri, 15 Dec 2023 10:57:18 +0000 Subject: [PATCH 04/30] [Flax BERT] Update deprecated 'split' method (#28012) * [Flax BERT] Update deprecated 'split' method * fix copies --- src/transformers/models/bert/modeling_flax_bert.py | 2 +- src/transformers/models/roberta/modeling_flax_roberta.py | 2 +- .../roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py | 2 +- .../models/xlm_roberta/modeling_flax_xlm_roberta.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py index d99b908a07380a..b32a618655e600 100644 --- a/src/transformers/models/bert/modeling_flax_bert.py +++ b/src/transformers/models/bert/modeling_flax_bert.py @@ -1569,7 +1569,7 @@ def __call__( hidden_states = outputs[0] logits = self.qa_outputs(hidden_states) - start_logits, end_logits = logits.split(self.config.num_labels, axis=-1) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) diff --git a/src/transformers/models/roberta/modeling_flax_roberta.py b/src/transformers/models/roberta/modeling_flax_roberta.py index 9cace41181825c..70a6f540a2352a 100644 --- a/src/transformers/models/roberta/modeling_flax_roberta.py +++ b/src/transformers/models/roberta/modeling_flax_roberta.py @@ -1344,7 +1344,7 @@ def __call__( hidden_states = outputs[0] logits = self.qa_outputs(hidden_states) - start_logits, end_logits = logits.split(self.config.num_labels, axis=-1) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) diff --git a/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py index 7fc73e530db4db..c13778c1ac04dd 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py @@ -1365,7 +1365,7 @@ def __call__( hidden_states = outputs[0] logits = self.qa_outputs(hidden_states) - start_logits, end_logits = logits.split(self.config.num_labels, axis=-1) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) diff --git a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py index e197add6d75d54..e8247b3f28de39 100644 --- a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py @@ 
-1359,7 +1359,7 @@ def __call__( hidden_states = outputs[0] logits = self.qa_outputs(hidden_states) - start_logits, end_logits = logits.split(self.config.num_labels, axis=-1) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) From 6af3ce7757e87e7e3380b0405bd0757805d41182 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Fri, 15 Dec 2023 10:57:36 +0000 Subject: [PATCH 05/30] [Flax LLaMA] Fix attn dropout (#28059) --- src/transformers/models/llama/modeling_flax_llama.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py index 9430eba41c6548..73fb1cbb955044 100644 --- a/src/transformers/models/llama/modeling_flax_llama.py +++ b/src/transformers/models/llama/modeling_flax_llama.py @@ -289,6 +289,10 @@ def __call__( attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) attention_mask = combine_masks(attention_mask, causal_mask) + dropout_rng = None + if not deterministic and self.config.attention_dropout > 0.0: + dropout_rng = self.make_rng("dropout") + # During fast autoregressive decoding, we feed one position at a time, # and cache the keys and values step by step. if self.has_variable("cache", "cached_key") or init_cache: @@ -307,6 +311,8 @@ def __call__( query, key, bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_dropout, deterministic=deterministic, dtype=attention_dtype, ) From c817c17dbe264329b9f9d227b48ce70edd9e3204 Mon Sep 17 00:00:00 2001 From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com> Date: Fri, 15 Dec 2023 12:15:06 +0000 Subject: [PATCH 06/30] Remove SpeechT5 deprecated argument (#28062) --- .../models/speecht5/modeling_speecht5.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 63085bc0463fbe..a327d15623ea4a 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -15,7 +15,6 @@ """ PyTorch SpeechT5 model.""" import math -import warnings from typing import List, Optional, Tuple, Union import numpy as np @@ -2656,7 +2655,6 @@ def forward( return_dict: Optional[bool] = None, speaker_embeddings: Optional[torch.FloatTensor] = None, labels: Optional[torch.FloatTensor] = None, - stop_labels: Optional[torch.Tensor] = None, ) -> Union[Tuple, Seq2SeqSpectrogramOutput]: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): @@ -2704,12 +2702,6 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if stop_labels is not None: - warnings.warn( - "The argument `stop_labels` is deprecated and will be removed in version 4.30.0 of Transformers", - FutureWarning, - ) - if labels is not None: if decoder_input_values is None: decoder_input_values = shift_spectrograms_right(labels, self.config.reduction_factor) @@ -2981,7 +2973,6 @@ def forward( return_dict: Optional[bool] = None, speaker_embeddings: Optional[torch.FloatTensor] = None, labels: Optional[torch.FloatTensor] = None, - stop_labels: Optional[torch.Tensor] = None, ) -> Union[Tuple, Seq2SeqSpectrogramOutput]: r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): @@ 
-3035,12 +3026,6 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if stop_labels is not None: - warnings.warn( - "The argument `stop_labels` is deprecated and will be removed in version 4.30.0 of Transformers", - FutureWarning, - ) - if labels is not None: if decoder_input_values is None: decoder_input_values = shift_spectrograms_right(labels, self.config.reduction_factor) From 70a127a37a1d168898ec5631872a7aadeec6176a Mon Sep 17 00:00:00 2001 From: Cylis Date: Fri, 15 Dec 2023 21:01:39 +0800 Subject: [PATCH 07/30] doc: Correct spelling mistake (#28064) --- docs/source/zh/preprocessing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/zh/preprocessing.md b/docs/source/zh/preprocessing.md index f2b3189dd4a6c7..266cf0e6b9ef3c 100644 --- a/docs/source/zh/preprocessing.md +++ b/docs/source/zh/preprocessing.md @@ -73,7 +73,7 @@ pip install datasets `tokenizer`返回一个包含三个重要对象的字典: * [input_ids](glossary#input-ids) 是与句子中每个`token`对应的索引。 -* [attention_mask](glossary#attention-mask) 指示是否应该关注一个`toekn`。 +* [attention_mask](glossary#attention-mask) 指示是否应该关注一个`token`。 * [token_type_ids](glossary#token-type-ids) 在存在多个序列时标识一个`token`属于哪个序列。 通过解码 `input_ids` 来返回您的输入: From d269c4b2d7bbe8f25f3daab818bb13bb5ea4ca45 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 15 Dec 2023 14:05:20 +0100 Subject: [PATCH 08/30] =?UTF-8?q?[`Mixtral`]=C2=A0update=20conversion=20sc?= =?UTF-8?q?ript=20to=20reflect=20new=20changes=20(#28068)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update convert_mixtral_weights_to_hf.py * forward contrib credits from original fix --------- Co-authored-by: thomasw21 --- .../models/mixtral/convert_mixtral_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py b/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py index 53cb8014438165..10b753f4224858 100644 --- a/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py +++ b/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py @@ -65,7 +65,7 @@ def write_model(model_path, input_base_path, model_size, safe_serialization=True num_shards = 1 # For some reason this is a string in the params.json - sliding_window = int(params["sliding_window"]) + sliding_window = int(params["sliding_window"]) if "sliding_window" in params else None n_layers = params["num_hidden_layers"] n_heads = params["num_attention_heads"] n_heads_per_shard = n_heads // num_shards From deb72cb6d931cebd9f75c4a62a3cb203249d997b Mon Sep 17 00:00:00 2001 From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com> Date: Fri, 15 Dec 2023 13:39:16 +0000 Subject: [PATCH 09/30] Skip M4T `test_retain_grad_hidden_states_attentions` (#28060) * skip test from SpeechInput * refine description of skip --- tests/models/seamless_m4t/test_modeling_seamless_m4t.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index 0413721ba681dc..68979202d46e6e 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -20,7 +20,7 @@ import unittest from transformers import SeamlessM4TConfig, is_speech_available, is_torch_available -from transformers.testing_utils import is_flaky, 
require_torch, slow, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from transformers.trainer_utils import set_seed from transformers.utils import cached_property @@ -610,9 +610,11 @@ def test_attention_outputs(self): [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], ) - @is_flaky() + @unittest.skip( + reason="In training model, the first speech encoder layer is sometimes skipped. Training is not supported yet, so the test is ignored." + ) def test_retain_grad_hidden_states_attentions(self): - super().test_retain_grad_hidden_states_attentions() + pass @require_torch From e2b6df79719024f7ba03611000054b9e34540f7b Mon Sep 17 00:00:00 2001 From: Adilzhan Ismailov <13088690+aismlv@users.noreply.github.com> Date: Fri, 15 Dec 2023 14:05:20 +0000 Subject: [PATCH 10/30] [LLaVa] Add past_key_values to _skip_keys_device_placement to fix multi-GPU dispatch (#28051) Add past_key_values to _skip_keys_device_placement for LLaVa --- src/transformers/models/llava/modeling_llava.py | 1 + src/transformers/models/vipllava/modeling_vipllava.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 3a7dbc198e3732..453ab760b7ec47 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -130,6 +130,7 @@ class LlavaPreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["LlavaVisionAttention"] + _skip_keys_device_placement = "past_key_values" _supports_flash_attn_2 = True def _init_weights(self, module): diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 0b1dc3fa86b383..f9b1d5f3c93a8c 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -137,6 +137,7 @@ class VipLlavaPreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["VipLlavaVisionAttention"] + _skip_keys_device_placement = "past_key_values" _supports_flash_attn_2 = True def _init_weights(self, module): From 74cae670ce542b62c44a5603f0675ff31932793c Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Fri, 15 Dec 2023 09:45:31 -0500 Subject: [PATCH 11/30] Make GPT2 traceable in meta state (#28054) * Put device in tensor constructor instead of to() * Fix copy --- .../decision_transformer/modeling_decision_transformer.py | 2 +- src/transformers/models/gpt2/modeling_gpt2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index d07a25c8915877..fdfb5b37d22e62 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -185,7 +185,7 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
# Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device) attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) if attention_mask is not None: diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index bc95c774039ffc..494aecaeabe1e3 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -198,7 +198,7 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device) attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) if attention_mask is not None: From dec84b3211992e20daabe7bcd7e9534b2cc7cc01 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Fri, 15 Dec 2023 16:01:18 +0100 Subject: [PATCH 12/30] make torch.load a bit safer (#27282) * make torch.load a bit safer * Fixes --------- Co-authored-by: Lysandre --- src/transformers/convert_pytorch_checkpoint_to_tf2.py | 2 +- src/transformers/modeling_flax_pytorch_utils.py | 4 ++-- src/transformers/modeling_tf_pytorch_utils.py | 2 +- src/transformers/modeling_utils.py | 4 ++-- src/transformers/models/wav2vec2/modeling_wav2vec2.py | 2 +- src/transformers/trainer.py | 8 ++++---- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py index f1358408a5cb57..f300b0bb92c661 100755 --- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -329,7 +329,7 @@ def convert_pt_checkpoint_to_tf( if compare_with_pt_model: tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network - state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu") + state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu", weights_only=True) pt_model = pt_model_class.from_pretrained( pretrained_model_name_or_path=None, config=config, state_dict=state_dict ) diff --git a/src/transformers/modeling_flax_pytorch_utils.py b/src/transformers/modeling_flax_pytorch_utils.py index f78c4e78c78ba8..f6014d7c208ab6 100644 --- a/src/transformers/modeling_flax_pytorch_utils.py +++ b/src/transformers/modeling_flax_pytorch_utils.py @@ -68,7 +68,7 @@ def load_pytorch_checkpoint_in_flax_state_dict( for k in f.keys(): pt_state_dict[k] = f.get_tensor(k) else: - pt_state_dict = torch.load(pt_path, map_location="cpu") + pt_state_dict = torch.load(pt_path, map_location="cpu", weights_only=True) logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters.") flax_state_dict = convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model) @@ -249,7 +249,7 @@ def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model): flax_state_dict = {} for shard_file in shard_filenames: # load using msgpack 
utils - pt_state_dict = torch.load(shard_file) + pt_state_dict = torch.load(shard_file, weights_only=True) pt_state_dict = {k: v.numpy() for k, v in pt_state_dict.items()} model_prefix = flax_model.base_model_prefix diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py index c599b795bf1932..aca1b9e4d9dccf 100644 --- a/src/transformers/modeling_tf_pytorch_utils.py +++ b/src/transformers/modeling_tf_pytorch_utils.py @@ -186,7 +186,7 @@ def load_pytorch_checkpoint_in_tf2_model( if pt_path.endswith(".safetensors"): state_dict = safe_load_file(pt_path) else: - state_dict = torch.load(pt_path, map_location="cpu") + state_dict = torch.load(pt_path, map_location="cpu", weights_only=True) pt_state_dict.update(state_dict) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 7e5d3e54e619e8..8be9709d072afe 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -480,7 +480,7 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True): error_message += f"\nMissing key(s): {str_unexpected_keys}." raise RuntimeError(error_message) - loader = safe_load_file if load_safe else partial(torch.load, map_location="cpu") + loader = safe_load_file if load_safe else partial(torch.load, map_location="cpu", weights_only=True) for shard_file in shard_files: state_dict = loader(os.path.join(folder, shard_file)) @@ -516,7 +516,7 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike]): else: map_location = "cpu" - return torch.load(checkpoint_file, map_location=map_location) + return torch.load(checkpoint_file, map_location=map_location, weights_only=True) except Exception as e: try: with open(checkpoint_file) as f: diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 3d97e7c73d3522..ddfa2e21263f0f 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1333,7 +1333,7 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): cache_dir=cache_dir, ) - state_dict = torch.load(weight_path, map_location="cpu") + state_dict = torch.load(weight_path, map_location="cpu", weights_only=True) except EnvironmentError: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 3a4ff5528047ae..0b56488907fc17 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2086,7 +2086,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): logger.warning( "Enabling FP16 and loading from smp < 1.10 checkpoint together is not suppported." ) - state_dict = torch.load(weights_file, map_location="cpu") + state_dict = torch.load(weights_file, map_location="cpu", weights_only=True) # Required for smp to not auto-translate state_dict from hf to smp (is already smp). 
state_dict["_smp_is_partial"] = False load_result = model.load_state_dict(state_dict, strict=True) @@ -2099,7 +2099,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): if self.args.save_safetensors and os.path.isfile(safe_weights_file): state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu") else: - state_dict = torch.load(weights_file, map_location="cpu") + state_dict = torch.load(weights_file, map_location="cpu", weights_only=True) # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963 # which takes *args instead of **kwargs @@ -2167,7 +2167,7 @@ def _load_best_model(self): if self.args.save_safetensors and os.path.isfile(best_safe_model_path): state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu") else: - state_dict = torch.load(best_model_path, map_location="cpu") + state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True) state_dict["_smp_is_partial"] = False load_result = model.load_state_dict(state_dict, strict=True) @@ -2196,7 +2196,7 @@ def _load_best_model(self): if self.args.save_safetensors and os.path.isfile(best_safe_model_path): state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu") else: - state_dict = torch.load(best_model_path, map_location="cpu") + state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True) # If the model is on the GPU, it still works! # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963 From 1c286be5081568a1b26c7c8b42932aa40f3a0c64 Mon Sep 17 00:00:00 2001 From: dumpmemory <64742282+dumpmemory@users.noreply.github.com> Date: Sat, 16 Dec 2023 00:18:56 +0800 Subject: [PATCH 13/30] Fix bug for checkpoint saving on multi node training setting (#28078) * add multi-node traning setting * fix style --- src/transformers/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 0b56488907fc17..ffe5f5c0d1556b 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2386,7 +2386,9 @@ def _save_checkpoint(self, model, trial, metrics=None): self.args.distributed_state.wait_for_everyone() # Then go through the rewriting process starting on process 0 if staging_output_dir != output_dir: - with self.args.main_process_first(desc="Renaming model checkpoint folder to true location"): + with self.args.main_process_first( + desc="Renaming model checkpoint folder to true location", local=self.args.save_on_each_node + ): if os.path.exists(staging_output_dir): os.rename(staging_output_dir, output_dir) From 26ea725bc0d90c75ba20d2f894321aa98b2c6cf2 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 15 Dec 2023 17:58:36 +0100 Subject: [PATCH 14/30] Update fixtures-image-utils (#28080) * fix hf-internal-testing/fixtures_image_utils * fix test * comments --- .../test_image_processing_imagegpt.py | 8 +++--- .../test_pipelines_depth_estimation.py | 10 ++++--- .../test_pipelines_image_classification.py | 10 ++++--- .../test_pipelines_image_segmentation.py | 10 ++++--- .../test_pipelines_object_detection.py | 10 ++++--- tests/utils/test_image_utils.py | 27 ++++++++++++------- 6 files changed, 46 insertions(+), 29 deletions(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index a806f032435cbb..4596d742a282bc 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ 
b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -226,10 +226,12 @@ def test_call_pytorch(self): def prepare_images(): - dataset = load_dataset("hf-internal-testing/fixtures_image_utils", split="test") + # we use revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") - image1 = Image.open(dataset[4]["file"]) - image2 = Image.open(dataset[5]["file"]) + image1 = dataset[4]["image"] + image2 = dataset[5]["image"] images = [image1, image2] diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index 009aa1c942d24a..abc58ca710b8b3 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -68,17 +68,19 @@ def run_pipeline_test(self, depth_estimator, examples): self.assertEqual({"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)}, outputs) import datasets - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test") + # we use revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") outputs = depth_estimator( [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), "http://images.cocodataset.org/val2017/000000039769.jpg", # RGBA - dataset[0]["file"], + dataset[0]["image"], # LA - dataset[1]["file"], + dataset[1]["image"], # L - dataset[2]["file"], + dataset[2]["image"], ] ) self.assertEqual( diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py index 7af16371a02083..bec538d53ab33a 100644 --- a/tests/pipelines/test_pipelines_image_classification.py +++ b/tests/pipelines/test_pipelines_image_classification.py @@ -72,7 +72,9 @@ def run_pipeline_test(self, image_classifier, examples): import datasets - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test") + # we use revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") # Accepts URL + PIL.Image + lists outputs = image_classifier( @@ -80,11 +82,11 @@ def run_pipeline_test(self, image_classifier, examples): Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), "http://images.cocodataset.org/val2017/000000039769.jpg", # RGBA - dataset[0]["file"], + dataset[0]["image"], # LA - dataset[1]["file"], + dataset[1]["image"], # L - dataset[2]["file"], + dataset[2]["image"], ] ) self.assertEqual( diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 9c5c8fdfd4a8dd..23a95570abd1c9 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -113,18 +113,20 @@ def run_pipeline_test(self, image_segmenter, examples): # to make it work self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs) - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test") + # we use 
revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") # RGBA - outputs = image_segmenter(dataset[0]["file"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0) + outputs = image_segmenter(dataset[0]["image"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0) m = len(outputs) self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs) # LA - outputs = image_segmenter(dataset[1]["file"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0) + outputs = image_segmenter(dataset[1]["image"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0) m = len(outputs) self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs) # L - outputs = image_segmenter(dataset[2]["file"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0) + outputs = image_segmenter(dataset[2]["image"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0) m = len(outputs) self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs) diff --git a/tests/pipelines/test_pipelines_object_detection.py b/tests/pipelines/test_pipelines_object_detection.py index 4196db36d765c8..ec4984b76f99bb 100644 --- a/tests/pipelines/test_pipelines_object_detection.py +++ b/tests/pipelines/test_pipelines_object_detection.py @@ -73,17 +73,19 @@ def run_pipeline_test(self, object_detector, examples): import datasets - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test") + # we use revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") batch = [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), "http://images.cocodataset.org/val2017/000000039769.jpg", # RGBA - dataset[0]["file"], + dataset[0]["image"], # LA - dataset[1]["file"], + dataset[1]["image"], # L - dataset[2]["file"], + dataset[2]["image"], ] batch_outputs = object_detector(batch, threshold=0.0) diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py index 5d899c2f1ddf72..ee45300a7e5279 100644 --- a/tests/utils/test_image_utils.py +++ b/tests/utils/test_image_utils.py @@ -538,9 +538,11 @@ def test_load_img_base64(self): self.assertEqual(img_arr.shape, (64, 32, 3)) def test_load_img_rgba(self): - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test") + # we use revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") - img = load_image(dataset[0]["file"]) # img with mode RGBA + img = load_image(dataset[0]["image"]) # img with mode RGBA img_arr = np.array(img) self.assertEqual( @@ -549,9 +551,11 @@ def test_load_img_rgba(self): ) def test_load_img_la(self): - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test") + # we use revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = 
datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") - img = load_image(dataset[1]["file"]) # img with mode LA + img = load_image(dataset[1]["image"]) # img with mode LA img_arr = np.array(img) self.assertEqual( @@ -560,9 +564,11 @@ def test_load_img_la(self): ) def test_load_img_l(self): - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test") + # we use revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") - img = load_image(dataset[2]["file"]) # img with mode L + img = load_image(dataset[2]["image"]) # img with mode L img_arr = np.array(img) self.assertEqual( @@ -571,10 +577,11 @@ def test_load_img_l(self): ) def test_load_img_exif_transpose(self): - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test") - img_file = dataset[3]["file"] + # we use revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") - img_without_exif_transpose = PIL.Image.open(img_file) + img_without_exif_transpose = dataset[3]["image"] img_arr_without_exif_transpose = np.array(img_without_exif_transpose) self.assertEqual( @@ -582,7 +589,7 @@ def test_load_img_exif_transpose(self): (333, 500, 3), ) - img_with_exif_transpose = load_image(img_file) + img_with_exif_transpose = load_image(dataset[3]["image"]) img_arr_with_exif_transpose = np.array(img_with_exif_transpose) self.assertEqual( From 29a1c1b472674030d61a6753cf1e3772f5d7131f Mon Sep 17 00:00:00 2001 From: Kotaro Tanahashi Date: Sat, 16 Dec 2023 02:03:41 +0900 Subject: [PATCH 15/30] Fix `low_cpu_mem_usage` Flag Conflict with DeepSpeed Zero 3 in `from_pretrained` for Models with `keep_in_fp32_modules`" (#27762) Fix `from_pretrained` Logic for `low_cpu_mem_usage` with DeepSpeed Zero3 --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 8be9709d072afe..e0faf378b77f53 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -3466,7 +3466,7 @@ def from_pretrained( # Check first if we are `from_pt` if use_keep_in_fp32_modules: - if is_accelerate_available(): + if is_accelerate_available() and not is_deepspeed_zero3_enabled(): low_cpu_mem_usage = True keep_in_fp32_modules = model._keep_in_fp32_modules else: From ffa04def0e6765cdf1d9518546506325a72e2bbe Mon Sep 17 00:00:00 2001 From: Ligeng Zhu Date: Sat, 16 Dec 2023 01:09:50 +0800 Subject: [PATCH 16/30] Fix wrong examples in llava usage. (#28020) * Fix wrong examples in llava usage. 
* Update modeling_llava.py --- src/transformers/models/llava/modeling_llava.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 453ab760b7ec47..821462c63c37c7 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -370,12 +370,12 @@ def forward( >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(text=text, images=image, return_tensors="pt") + >>> inputs = processor(text=prompt, images=image, return_tensors="pt") >>> # Generate >>> generate_ids = model.generate(**inputs, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "There seems to be a stop sign" + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "\nUSER: What's the content of the image?\nASSISTANT: The image features a stop sign on a street corner" ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions From 1faeff85ce1b7278f83dd3f131ec24461c3ad752 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 15 Dec 2023 20:16:47 +0100 Subject: [PATCH 17/30] Fix Vip-llava docs (#28085) * Update vipllava.md * Update modeling_vipllava.py --- docs/source/en/model_doc/vipllava.md | 4 ++-- .../models/vipllava/modeling_vipllava.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index c5f3c5f55f2c56..35f2467486a895 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -37,13 +37,13 @@ Tips: - For better results, we recommend users to prompt the model with the correct prompt format: ```bash -"USER: \nASSISTANT:" +A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: \n###Assistant: ``` For multiple turns conversation: ```bash -"USER: \nASSISTANT: USER: ASSISTANT: USER: ASSISTANT:" +A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: \n###Assistant: ###Human: ###Assistant: ``` The original code can be found [here](https://github.com/mu-cai/ViP-LLaVA). 
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index f9b1d5f3c93a8c..1ccabd754f9084 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -367,23 +367,26 @@ def forward( Example: ```python + >>> import torch >>> from PIL import Image >>> import requests >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration - >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vipllava-7b-hf") - >>> processor = AutoProcessor.from_pretrained("llava-hf/vipllava-7b-hf") + >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16) + >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf") - >>> prompt = "USER: \nCan you please describe this image?\nASSISTANT:" + >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: \n{}###Assistant:" + >>> question = "Can you please describe this image?" + >>> prompt = prompt.format(question) >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(text=text, images=image, return_tensors="pt") + >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16) >>> # Generate >>> generate_ids = model.generate(**inputs, max_new_tokens=20) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "USER: \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on a green surface, with a red ball in its paw." + >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True) + The image features a brown and white cat sitting on a green surface, with a red ball in its ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions From 0d63d17765f954ba2b050c1d8be0001e952b7830 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 15 Dec 2023 12:06:55 -0800 Subject: [PATCH 18/30] [docs] Trainer (#27986) * first draft * add to toctree * edits * feedback --- docs/source/en/_toctree.yml | 2 + docs/source/en/main_classes/trainer.md | 335 +------------------- docs/source/en/trainer.md | 408 +++++++++++++++++++++++++ 3 files changed, 416 insertions(+), 329 deletions(-) create mode 100644 docs/source/en/trainer.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 09210a471e3acd..b8413b2ebd5a79 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -135,6 +135,8 @@ title: Overview - local: quantization title: Quantization + - local: trainer + title: Trainer - sections: - local: perf_train_gpu_one title: Methods and tools for efficient training on a single GPU diff --git a/docs/source/en/main_classes/trainer.md b/docs/source/en/main_classes/trainer.md index cf1dd672d3d472..beb3241e6232a3 100644 --- a/docs/source/en/main_classes/trainer.md +++ b/docs/source/en/main_classes/trainer.md @@ -16,70 +16,23 @@ rendered properly in your Markdown viewer. # Trainer -The [`Trainer`] class provides an API for feature-complete training in PyTorch for most standard use cases. 
It's used in most of the [example scripts](https://github.com/huggingface/transformers/tree/main/examples). +The [`Trainer`] class provides an API for feature-complete training in PyTorch, and it supports distributed training on multiple GPUs/TPUs, mixed precision for [NVIDIA GPUs](https://nvidia.github.io/apex/), [AMD GPUs](https://rocm.docs.amd.com/en/latest/rocm.html), and [`torch.amp`](https://pytorch.org/docs/stable/amp.html) for PyTorch. [`Trainer`] goes hand-in-hand with the [`TrainingArguments`] class, which offers a wide range of options to customize how a model is trained. Together, these two classes provide a complete training API. - - -If you're looking to fine-tune a language model like Llama-2 or Mistral on a text dataset using autoregressive techniques, consider using [`trl`](https://github.com/huggingface/trl)'s [`~trl.SFTTrainer`]. The [`~trl.SFTTrainer`] wraps the [`Trainer`] and is specially optimized for this particular task and supports sequence packing, LoRA, quantization, and DeepSpeed for efficient scaling to any model size. On the other hand, the [`Trainer`] is a more versatile option, suitable for a broader spectrum of tasks. - - - -Before instantiating your [`Trainer`], create a [`TrainingArguments`] to access all the points of customization during training. - -The API supports distributed training on multiple GPUs/TPUs, mixed precision through [NVIDIA Apex] for NVIDIA GPUs, [ROCm APEX](https://github.com/ROCmSoftwarePlatform/apex) for AMD GPUs, and Native AMP for PyTorch. - -The [`Trainer`] contains the basic training loop which supports the above features. To inject custom behavior you can subclass them and override the following methods: - -- **get_train_dataloader** -- Creates the training DataLoader. -- **get_eval_dataloader** -- Creates the evaluation DataLoader. -- **get_test_dataloader** -- Creates the test DataLoader. -- **log** -- Logs information on the various objects watching training. -- **create_optimizer_and_scheduler** -- Sets up the optimizer and learning rate scheduler if they were not passed at - init. Note, that you can also subclass or override the `create_optimizer` and `create_scheduler` methods - separately. -- **create_optimizer** -- Sets up the optimizer if it wasn't passed at init. -- **create_scheduler** -- Sets up the learning rate scheduler if it wasn't passed at init. -- **compute_loss** - Computes the loss on a batch of training inputs. -- **training_step** -- Performs a training step. -- **prediction_step** -- Performs an evaluation/test step. -- **evaluate** -- Runs an evaluation loop and returns metrics. -- **predict** -- Returns predictions (with metrics if labels are available) on a test set. +[`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] inherit from the [`Trainer`] and [`TrainingArgument`] classes and they're adapted for training models for sequence-to-sequence tasks such as summarization or translation. The [`Trainer`] class is optimized for 🤗 Transformers models and can have surprising behaviors -when you use it on other models. When using it on your own model, make sure: +when used with other models. When using it with your own model, make sure: -- your model always return tuples or subclasses of [`~utils.ModelOutput`]. 
+- your model always return tuples or subclasses of [`~utils.ModelOutput`] - your model can compute the loss if a `labels` argument is provided and that loss is returned as the first element of the tuple (if your model returns tuples) -- your model can accept multiple label arguments (use the `label_names` in your [`TrainingArguments`] to indicate their name to the [`Trainer`]) but none of them should be named `"label"`. +- your model can accept multiple label arguments (use `label_names` in [`TrainingArguments`] to indicate their name to the [`Trainer`]) but none of them should be named `"label"` -Here is an example of how to customize [`Trainer`] to use a weighted loss (useful when you have an unbalanced training set): - -```python -from torch import nn -from transformers import Trainer - - -class CustomTrainer(Trainer): - def compute_loss(self, model, inputs, return_outputs=False): - labels = inputs.pop("labels") - # forward pass - outputs = model(**inputs) - logits = outputs.get("logits") - # compute custom loss (suppose one has 3 labels with different weights) - loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device)) - loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)) - return (loss, outputs) if return_outputs else loss -``` - -Another way to customize the training loop behavior for the PyTorch [`Trainer`] is to use [callbacks](callback) that can inspect the training loop state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early stopping). - - -## Trainer +## Trainer[[api-reference]] [[autodoc]] Trainer - all @@ -100,105 +53,6 @@ Another way to customize the training loop behavior for the PyTorch [`Trainer`] [[autodoc]] Seq2SeqTrainingArguments - all -## Checkpoints - -By default, [`Trainer`] will save all checkpoints in the `output_dir` you set in the -[`TrainingArguments`] you are using. Those will go in subfolder named `checkpoint-xxx` with xxx -being the step at which the training was at. - -Resuming training from a checkpoint can be done when calling [`Trainer.train`] with either: - -- `resume_from_checkpoint=True` which will resume training from the latest checkpoint -- `resume_from_checkpoint=checkpoint_dir` which will resume training from the specific checkpoint in the directory - passed. - -In addition, you can easily save your checkpoints on the Model Hub when using `push_to_hub=True`. By default, all -the models saved in intermediate checkpoints are saved in different commits, but not the optimizer state. You can adapt -the `hub-strategy` value of your [`TrainingArguments`] to either: - -- `"checkpoint"`: the latest checkpoint is also pushed in a subfolder named last-checkpoint, allowing you to - resume training easily with `trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")`. -- `"all_checkpoints"`: all checkpoints are pushed like they appear in the output folder (so you will get one - checkpoint folder per folder in your final repository) - - -## Logging - -By default [`Trainer`] will use `logging.INFO` for the main process and `logging.WARNING` for the replicas if any. 
- -These defaults can be overridden to use any of the 5 `logging` levels with [`TrainingArguments`]'s -arguments: - -- `log_level` - for the main process -- `log_level_replica` - for the replicas - -Further, if [`TrainingArguments`]'s `log_on_each_node` is set to `False` only the main node will -use the log level settings for its main process, all other nodes will use the log level settings for replicas. - -Note that [`Trainer`] is going to set `transformers`'s log level separately for each node in its -[`Trainer.__init__`]. So you may want to set this sooner (see the next example) if you tap into other -`transformers` functionality before creating the [`Trainer`] object. - -Here is an example of how this can be used in an application: - -```python -[...] -logger = logging.getLogger(__name__) - -# Setup logging -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], -) - -# set the main code and the modules it uses to the same log-level according to the node -log_level = training_args.get_process_log_level() -logger.setLevel(log_level) -datasets.utils.logging.set_verbosity(log_level) -transformers.utils.logging.set_verbosity(log_level) - -trainer = Trainer(...) -``` - -And then if you only want to see warnings on the main node and all other nodes to not print any most likely duplicated -warnings you could run it as: - -```bash -my_app.py ... --log_level warning --log_level_replica error -``` - -In the multi-node environment if you also don't want the logs to repeat for each node's main process, you will want to -change the above to: - -```bash -my_app.py ... --log_level warning --log_level_replica error --log_on_each_node 0 -``` - -and then only the main process of the first node will log at the "warning" level, and all other processes on the main -node and all processes on other nodes will log at the "error" level. - -If you need your application to be as quiet as possible you could do: - -```bash -my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0 -``` - -(add `--log_on_each_node 0` if on multi-node environment) - - -## Randomness - -When resuming from a checkpoint generated by [`Trainer`] all efforts are made to restore the -_python_, _numpy_ and _pytorch_ RNG states to the same states as they were at the moment of saving that checkpoint, -which should make the "stop and resume" style of training as close as possible to non-stop training. - -However, due to various default non-deterministic pytorch settings this might not fully work. If you want full -determinism please refer to [Controlling sources of randomness](https://pytorch.org/docs/stable/notes/randomness). As explained in the document, that some of those settings -that make things deterministic (.e.g., `torch.backends.cudnn.deterministic`) may slow things down, therefore this -can't be done by default, but you can enable those yourself if needed. - - ## Specific GPUs Selection Let's discuss how you can tell your program which GPUs are to be used and in what order. @@ -295,9 +149,6 @@ In this example we are working with just 2 GPUs, but of course the same would ap Also if you do set this environment variable it's the best to set it in your `~/.bashrc` file or some other startup config file and forget about it. 
- - - ## Trainer Integrations The [`Trainer`] has been extended to support libraries that may dramatically improve your training @@ -579,156 +430,6 @@ Finally, please, remember that, 🤗 `Trainer` only integrates MPS backend, ther have any problems or questions with regards to MPS backend usage, please, file an issue with [PyTorch GitHub](https://github.com/pytorch/pytorch/issues). - -## Using Accelerate Launcher with Trainer - -Accelerate now powers Trainer. In terms of what users should expect: -- They can keep using the Trainer ingterations such as FSDP, DeepSpeed vis trainer arguments without any changes on their part. -- They can now use Accelerate Launcher with Trainer (recommended). - -Steps to use Accelerate Launcher with Trainer: -1. Make sure 🤗 Accelerate is installed, you can't use the `Trainer` without it anyway. If not `pip install accelerate`. You may also need to update your version of Accelerate: `pip install accelerate --upgrade` -2. Run `accelerate config` and fill the questionnaire. Below are example accelerate configs: - a. DDP Multi-node Multi-GPU config: - ```yaml - compute_environment: LOCAL_MACHINE - distributed_type: MULTI_GPU - downcast_bf16: 'no' - gpu_ids: all - machine_rank: 0 #change rank as per the node - main_process_ip: 192.168.20.1 - main_process_port: 9898 - main_training_function: main - mixed_precision: fp16 - num_machines: 2 - num_processes: 8 - rdzv_backend: static - same_network: true - tpu_env: [] - tpu_use_cluster: false - tpu_use_sudo: false - use_cpu: false - ``` - - b. FSDP config: - ```yaml - compute_environment: LOCAL_MACHINE - distributed_type: FSDP - downcast_bf16: 'no' - fsdp_config: - fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP - fsdp_backward_prefetch_policy: BACKWARD_PRE - fsdp_forward_prefetch: true - fsdp_offload_params: false - fsdp_sharding_strategy: 1 - fsdp_state_dict_type: FULL_STATE_DICT - fsdp_sync_module_states: true - fsdp_transformer_layer_cls_to_wrap: BertLayer - fsdp_use_orig_params: true - machine_rank: 0 - main_training_function: main - mixed_precision: bf16 - num_machines: 1 - num_processes: 2 - rdzv_backend: static - same_network: true - tpu_env: [] - tpu_use_cluster: false - tpu_use_sudo: false - use_cpu: false - ``` - c. DeepSpeed config pointing to a file: - ```yaml - compute_environment: LOCAL_MACHINE - deepspeed_config: - deepspeed_config_file: /home/user/configs/ds_zero3_config.json - zero3_init_flag: true - distributed_type: DEEPSPEED - downcast_bf16: 'no' - machine_rank: 0 - main_training_function: main - num_machines: 1 - num_processes: 4 - rdzv_backend: static - same_network: true - tpu_env: [] - tpu_use_cluster: false - tpu_use_sudo: false - use_cpu: false - ``` - - d. DeepSpeed config using accelerate plugin: - ```yaml - compute_environment: LOCAL_MACHINE - deepspeed_config: - gradient_accumulation_steps: 1 - gradient_clipping: 0.7 - offload_optimizer_device: cpu - offload_param_device: cpu - zero3_init_flag: true - zero_stage: 2 - distributed_type: DEEPSPEED - downcast_bf16: 'no' - machine_rank: 0 - main_training_function: main - mixed_precision: bf16 - num_machines: 1 - num_processes: 4 - rdzv_backend: static - same_network: true - tpu_env: [] - tpu_use_cluster: false - tpu_use_sudo: false - use_cpu: false - ``` - -3. Run the Trainer script with args other than the ones handled above by accelerate config or launcher args. -Below is an example to run `run_glue.py` using `accelerate launcher` with FSDP config from above. 
- -```bash -cd transformers - -accelerate launch \ -./examples/pytorch/text-classification/run_glue.py \ ---model_name_or_path bert-base-cased \ ---task_name $TASK_NAME \ ---do_train \ ---do_eval \ ---max_seq_length 128 \ ---per_device_train_batch_size 16 \ ---learning_rate 5e-5 \ ---num_train_epochs 3 \ ---output_dir /tmp/$TASK_NAME/ \ ---overwrite_output_dir -``` - -4. You can also directly use the cmd args for `accelerate launch`. Above example would map to: - -```bash -cd transformers - -accelerate launch --num_processes=2 \ ---use_fsdp \ ---mixed_precision=bf16 \ ---fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \ ---fsdp_transformer_layer_cls_to_wrap="BertLayer" \ ---fsdp_sharding_strategy=1 \ ---fsdp_state_dict_type=FULL_STATE_DICT \ -./examples/pytorch/text-classification/run_glue.py ---model_name_or_path bert-base-cased \ ---task_name $TASK_NAME \ ---do_train \ ---do_eval \ ---max_seq_length 128 \ ---per_device_train_batch_size 16 \ ---learning_rate 5e-5 \ ---num_train_epochs 3 \ ---output_dir /tmp/$TASK_NAME/ \ ---overwrite_output_dir -``` - -For more information, please refer the 🤗 Accelerate CLI guide: [Launching your 🤗 Accelerate scripts](https://huggingface.co/docs/accelerate/basic_tutorials/launch). - Sections that were moved: [ DeepSpeed @@ -755,27 +456,3 @@ Sections that were moved: | Gradient Clipping | Getting The Model Weights Out ] - -## Boost your fine-tuning performances using NEFTune - - -NEFTune is a technique to boost the performance of chat models and was introduced by the paper “NEFTune: Noisy Embeddings Improve Instruction Finetuning” from Jain et al. it consists of adding noise to the embedding vectors during training. According to the abstract of the paper: - -> Standard finetuning of LLaMA-2-7B using Alpaca achieves 29.79% on AlpacaEval, which rises to 64.69% using noisy embeddings. NEFTune also improves over strong baselines on modern instruction datasets. Models trained with Evol-Instruct see a 10% improvement, with ShareGPT an 8% improvement, and with OpenPlatypus an 8% improvement. Even powerful models further refined with RLHF such as LLaMA-2-Chat benefit from additional training with NEFTune. - -
- -
- -To use it in `Trainer` simply pass `neftune_noise_alpha` when creating your `TrainingArguments` instance. Note that to avoid any surprising behaviour, NEFTune is disabled after training to retrieve back the original behaviour of the embedding layer. - -```python -from transformers import Trainer, TrainingArguments - -args = TrainingArguments(..., neftune_noise_alpha=0.1) -trainer = Trainer(..., args=args) - -... - -trainer.train() -``` diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md new file mode 100644 index 00000000000000..cb5e2631a2b550 --- /dev/null +++ b/docs/source/en/trainer.md @@ -0,0 +1,408 @@ + + +# Trainer + +The [`Trainer`] is a complete training and evaluation loop for PyTorch models implemented in the Transformers library. You only need to pass it the necessary pieces for training (model, tokenizer, dataset, evaluation function, training hyperparameters, etc.), and the [`Trainer`] class takes care of the rest. This makes it easier to start training faster without manually writing your own training loop. But at the same time, [`Trainer`] is very customizable and offers a ton of training options so you can tailor it to your exact training needs. + + + +In addition to the [`Trainer`] class, Transformers also provides a [`Seq2SeqTrainer`] class for sequence-to-sequence tasks like translation or summarization. There is also the [`~trl.SFTTrainer`] class from the [TRL](https://hf.co/docs/trl) library which wraps the [`Trainer`] class and is optimized for training language models like Llama-2 and Mistral with autoregressive techniques. [`~trl.SFTTrainer`] also supports features like sequence packing, LoRA, quantization, and DeepSpeed for efficiently scaling to any model size. + +
+ +Feel free to check out the [API reference](./main_classes/trainer) for these other [`Trainer`]-type classes to learn more about when to use which one. In general, [`Trainer`] is the most versatile option and is appropriate for a broad spectrum of tasks. [`Seq2SeqTrainer`] is designed for sequence-to-sequence tasks and [`~trl.SFTTrainer`] is designed for training language models. + +
+ +Before you start, make sure [Accelerate](https://hf.co/docs/accelerate) - a library for enabling and running PyTorch training across distributed environments - is installed. + +```bash +pip install accelerate + +# upgrade +pip install accelerate --upgrade +``` + +This guide provides an overview of the [`Trainer`] class. + +## Basic usage + +[`Trainer`] includes all the code you'll find in a basic training loop: + +1. perform a training step to calculate the loss +2. calculate the gradients with the [`~accelerate.Accelerator.backward`] method +3. update the weights based on the gradients +4. repeat this process until you've reached a predetermined number of epochs + +The [`Trainer`] class abstracts all of this code away so you don't have to worry about manually writing a training loop every time or if you're just getting started with PyTorch and training. You only need to provide the essential components required for training, such as a model and a dataset, and the [`Trainer`] class handles everything else. + +If you want to specify any training options or hyperparameters, you can find them in the [`TrainingArguments`] class. For example, let's define where to save the model in `output_dir` and push the model to the Hub after training with `push_to_hub=True`. + +```py +from transformers import TrainingArguments + +training_args = TrainingArguments( + output_dir="your-model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + evaluation_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, +) +``` + +Pass `training_args` to the [`Trainer`] along with a model, dataset, something to preprocess the dataset with (depending on your data type it could be a tokenizer, feature extractor or image processor), a data collator, and a function to compute the metrics you want to track during training. + +Finally, call [`~Trainer.train`] to start training! + +```py +from transformers import Trainer + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) + +trainer.train() +``` + +### Checkpoints + +The [`Trainer`] class saves your model checkpoints to the directory specified in the `output_dir` parameter of [`TrainingArguments`]. You'll find the checkpoints saved in a `checkpoint-000` subfolder where the numbers at the end correspond to the training step. Saving checkpoints are useful for resuming training later. + +```py +# resume from latest checkpoint +trainer.train(resume_from_checkpoint=True) + +# resume from specific checkpoint saved in output directory +trainer.train(resume_from_checkpoint="your-model/checkpoint-1000") +``` + +You can save your checkpoints (the optimizer state is not saved by default) to the Hub by setting `push_to_hub=True` in [`TrainingArguments`] to commit and push them. 
Other options for deciding how your checkpoints are saved are set up in the [`hub_strategy`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.hub_strategy) parameter:
+
+* `hub_strategy="checkpoint"` pushes the latest checkpoint to a subfolder named "last-checkpoint" from which you can resume training
+* `hub_strategy="all_checkpoints"` pushes all checkpoints to the directory defined in `output_dir` (you'll see one checkpoint per folder in your model repository)
+
+When you resume training from a checkpoint, the [`Trainer`] tries to keep the Python, NumPy, and PyTorch RNG states the same as they were when the checkpoint was saved. But because PyTorch has various non-deterministic default settings, the RNG states aren't guaranteed to be the same. If you want to enable full determinism, take a look at the [Controlling sources of randomness](https://pytorch.org/docs/stable/notes/randomness#controlling-sources-of-randomness) guide to learn what you can enable to make your training fully deterministic. Keep in mind though that by making certain settings deterministic, training may be slower.
+
+## Customize the Trainer
+
+While the [`Trainer`] class is designed to be accessible and easy-to-use, it also offers a lot of customizability for more adventurous users. Many of the [`Trainer`]'s methods can be subclassed and overridden to support the functionality you want, without having to rewrite the entire training loop from scratch to accommodate it. These methods include:
+
+* [`~Trainer.get_train_dataloader`] creates a training DataLoader
+* [`~Trainer.get_eval_dataloader`] creates an evaluation DataLoader
+* [`~Trainer.get_test_dataloader`] creates a test DataLoader
+* [`~Trainer.log`] logs information on the various objects that watch training
+* [`~Trainer.create_optimizer_and_scheduler`] creates an optimizer and learning rate scheduler if they weren't passed in the `__init__`; these can also be separately customized with [`~Trainer.create_optimizer`] and [`~Trainer.create_scheduler`] respectively
+* [`~Trainer.compute_loss`] computes the loss on a batch of training inputs
+* [`~Trainer.training_step`] performs the training step
+* [`~Trainer.prediction_step`] performs the prediction and test step
+* [`~Trainer.evaluate`] evaluates the model and returns the evaluation metrics
+* [`~Trainer.predict`] makes predictions (with metrics if labels are available) on the test set
+
+For example, here is how you could override the [`~Trainer.compute_loss`] method to use a weighted loss instead.
+
+```py
+import torch
+from torch import nn
+from transformers import Trainer
+
+class CustomTrainer(Trainer):
+    def compute_loss(self, model, inputs, return_outputs=False):
+        labels = inputs.pop("labels")
+        # forward pass
+        outputs = model(**inputs)
+        logits = outputs.get("logits")
+        # compute custom loss for 3 labels with different weights
+        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
+        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
+        return (loss, outputs) if return_outputs else loss
+```
+
+### Callbacks
+
+Another option for customizing the [`Trainer`] is to use [callbacks](callbacks). Callbacks *don't change* anything in the training loop. They inspect the training loop state and then execute some action (early stopping, logging results, etc.) depending on the state.
In other words, a callback can't be used to implement something like a custom loss function and you'll need to subclass and override the [`~Trainer.compute_loss`] method for that. + +For example, if you want to add an early stopping callback to the training loop after 10 steps. + +```py +from transformers import TrainerCallback + +class EarlyStoppingCallback(TrainerCallback): + def __init__(self, num_steps=10): + self.num_steps = num_steps + + def on_step_end(self, args, state, control, **kwargs): + if state.global_step >= self.num_steps: + return {"should_training_stop": True} + else: + return {} +``` + +Then pass it to the [`Trainer`]'s `callback` parameter. + +```py +from transformers import Trainer + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + callback=[EarlyStoppingCallback()], +) +``` + +## Logging + + + +Check out the [logging](./main_classes/logging) API reference for more information about the different logging levels. + + + +The [`Trainer`] is set to `logging.INFO` by default which reports errors, warnings, and other basic information. A [`Trainer`] replica - in distributed environments - is set to `logging.WARNING` which only reports errors and warnings. You can change the logging level with the [`log_level`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.log_level) and [`log_level_replica`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.log_level_replica) parameters in [`TrainingArguments`]. + +To configure the log level setting for each node, use the [`log_on_each_node`](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.log_on_each_node) parameter to determine whether to use the log level on each node or only on the main node. + + + +[`Trainer`] sets the log level separately for each node in the [`Trainer.__init__`] method, so you may want to consider setting this sooner if you're using other Transformers functionalities before creating the [`Trainer`] object. + + + +For example, to set your main code and modules to use the same log level according to each node: + +```py +logger = logging.getLogger(__name__) + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], +) + +log_level = training_args.get_process_log_level() +logger.setLevel(log_level) +datasets.utils.logging.set_verbosity(log_level) +transformers.utils.logging.set_verbosity(log_level) + +trainer = Trainer(...) +``` + +Use different combinations of `log_level` and `log_level_replica` to configure what gets logged on each of the nodes. + + + + +```bash +my_app.py ... --log_level warning --log_level_replica error +``` + + + + +Add the `log_on_each_node 0` parameter for multi-node environments. + +```bash +my_app.py ... --log_level warning --log_level_replica error --log_on_each_node 0 + +# set to only report errors +my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0 +``` + + + + +## NEFTune + +[NEFTune](https://hf.co/papers/2310.05914) is a technique that can improve performance by adding noise to the embedding vectors during training. To enable it in [`Trainer`], set the `neftune_noise_alpha` parameter in [`TrainingArguments`] to control how much noise is added. 
+ +```py +from transformers import TrainingArguments, Trainer + +training_args = TrainingArguments(..., neftune_noise_alpha=0.1) +trainer = Trainer(..., args=training_args) +``` + +NEFTune is disabled after training to restore the original embedding layer to avoid any unexpected behavior. + +## Accelerate and Trainer + +The [`Trainer`] class is powered by [Accelerate](https://hf.co/docs/accelerate), a library for easily training PyTorch models in distributed environments with support for integrations such as [FullyShardedDataParallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/). + +To use Accelerate with [`Trainer`], run the [`accelerate.config`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-config) command to set up training for your training environment. This command creates a `config_file.yaml` that'll be used when you launch your training script. For example, some example configurations you can setup are: + + + + +```yml +compute_environment: LOCAL_MACHINE +distributed_type: MULTI_GPU +downcast_bf16: 'no' +gpu_ids: all +machine_rank: 0 #change rank as per the node +main_process_ip: 192.168.20.1 +main_process_port: 9898 +main_training_function: main +mixed_precision: fp16 +num_machines: 2 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + + + + +```yml +compute_environment: LOCAL_MACHINE +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch_policy: BACKWARD_PRE + fsdp_forward_prefetch: true + fsdp_offload_params: false + fsdp_sharding_strategy: 1 + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_sync_module_states: true + fsdp_transformer_layer_cls_to_wrap: BertLayer + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + + + + +```yml +compute_environment: LOCAL_MACHINE +deepspeed_config: + deepspeed_config_file: /home/user/configs/ds_zero3_config.json + zero3_init_flag: true +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + + + + +```yml +compute_environment: LOCAL_MACHINE +deepspeed_config: + gradient_accumulation_steps: 1 + gradient_clipping: 0.7 + offload_optimizer_device: cpu + offload_param_device: cpu + zero3_init_flag: true + zero_stage: 2 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + + + + +The [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) command is the recommended way to launch your training script on a distributed system with Accelerate and [`Trainer`] with the parameters specified in `config_file.yaml`. This file is saved to the Accelerate cache folder and automatically loaded when you run `accelerate_launch`. 
+ +For example, to run the [run_glue.py](https://github.com/huggingface/transformers/blob/f4db565b695582891e43a5e042e5d318e28f20b8/examples/pytorch/text-classification/run_glue.py#L4) training script with the FSDP configuration: + +```bash +accelerate launch \ + ./examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path bert-base-cased \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 16 \ + --learning_rate 5e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir +``` + +You could also specify the parameters from the `config_file.yaml` file directly in the command line: + +```bash +accelerate launch --num_processes=2 \ + --use_fsdp \ + --mixed_precision=bf16 \ + --fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \ + --fsdp_transformer_layer_cls_to_wrap="BertLayer" \ + --fsdp_sharding_strategy=1 \ + --fsdp_state_dict_type=FULL_STATE_DICT \ + ./examples/pytorch/text-classification/run_glue.py + --model_name_or_path bert-base-cased \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 16 \ + --learning_rate 5e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir +``` + +Check out the [Launching your Accelerate scripts](https://huggingface.co/docs/accelerate/basic_tutorials/launch) tutorial to learn more about `accelerate_launch` and custom configurations. From ebfdb9ca62205279d5019ef1403877461b3b2da4 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 15 Dec 2023 13:17:29 -0800 Subject: [PATCH 19/30] [docs] MPS (#28016) * mps docs * toctree --- docs/source/en/_toctree.yml | 2 +- docs/source/en/main_classes/trainer.md | 61 -------------------------- docs/source/en/perf_train_special.md | 47 ++++++++++++++++++-- 3 files changed, 44 insertions(+), 66 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b8413b2ebd5a79..57ab9769b60f80 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -151,7 +151,7 @@ - local: perf_train_tpu_tf title: Training on TPU with TensorFlow - local: perf_train_special - title: Training on Specialized Hardware + title: PyTorch training on Apple silicon - local: perf_hardware title: Custom hardware for training - local: hpo_train diff --git a/docs/source/en/main_classes/trainer.md b/docs/source/en/main_classes/trainer.md index beb3241e6232a3..2b2f5c3d5f8865 100644 --- a/docs/source/en/main_classes/trainer.md +++ b/docs/source/en/main_classes/trainer.md @@ -369,67 +369,6 @@ Pass `--fsdp "full shard"` along with following changes to be made in `--fsdp_co - For size based auto wrap policy, please add `min_num_params` in the config file. It specifies FSDP's minimum number of parameters for auto wrapping. - -### Using Trainer for accelerated PyTorch Training on Mac - -With PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training. -This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac. -Apple's Metal Performance Shaders (MPS) as a backend for PyTorch enables this and can be used via the new `"mps"` device. -This will map computational graphs and primitives on the MPS Graph framework and tuned kernels provided by MPS. 
-For more information please refer official documents [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) -and [MPS BACKEND](https://pytorch.org/docs/stable/notes/mps.html). - - - -We strongly recommend to install PyTorch >= 1.13 (nightly version at the time of writing) on your MacOS machine. -It has major fixes related to model correctness and performance improvements for transformer based models. -Please refer to https://github.com/pytorch/pytorch/issues/82707 for more details. - - - -**Benefits of Training and Inference using Apple Silicon Chips** - -1. Enables users to train larger networks or batch sizes locally -2. Reduces data retrieval latency and provides the GPU with direct access to the full memory store due to unified memory architecture. -Therefore, improving end-to-end performance. -3. Reduces costs associated with cloud-based development or the need for additional local GPUs. - -**Pre-requisites**: To install torch with mps support, -please follow this nice medium article [GPU-Acceleration Comes to PyTorch on M1 Macs](https://medium.com/towards-data-science/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1). - -**Usage**: -`mps` device will be used by default if available similar to the way `cuda` device is used. -Therefore, no action from user is required. -For example, you can run the official Glue text classififcation task (from the root folder) using Apple Silicon GPU with below command: - -```bash -export TASK_NAME=mrpc - -python examples/pytorch/text-classification/run_glue.py \ - --model_name_or_path bert-base-cased \ - --task_name $TASK_NAME \ - --do_train \ - --do_eval \ - --max_seq_length 128 \ - --per_device_train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3 \ - --output_dir /tmp/$TASK_NAME/ \ - --overwrite_output_dir -``` - -**A few caveats to be aware of** - -1. Some PyTorch operations have not been implemented in mps and will throw an error. -One way to get around that is to set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1`, -which will fallback to CPU for these operations. It still throws a UserWarning however. -2. Distributed setups `gloo` and `nccl` are not working with `mps` device. -This means that currently only single GPU of `mps` device type can be used. - -Finally, please, remember that, 🤗 `Trainer` only integrates MPS backend, therefore if you -have any problems or questions with regards to MPS backend usage, please, -file an issue with [PyTorch GitHub](https://github.com/pytorch/pytorch/issues). - Sections that were moved: [ DeepSpeed diff --git a/docs/source/en/perf_train_special.md b/docs/source/en/perf_train_special.md index 48727b24fef3e3..b9bbe32897dbd6 100644 --- a/docs/source/en/perf_train_special.md +++ b/docs/source/en/perf_train_special.md @@ -13,12 +13,51 @@ rendered properly in your Markdown viewer. --> -# Training on Specialized Hardware +# PyTorch training on Apple silicon - +Previously, training models on a Mac was limited to the CPU only. With the release of PyTorch v1.12, you can take advantage of training models with Apple's silicon GPUs for significantly faster performance and training. This is powered in PyTorch by integrating Apple's Metal Performance Shaders (MPS) as a backend. The [MPS backend](https://pytorch.org/docs/stable/notes/mps.html) implements PyTorch operations as custom Metal shaders and places these modules on a `mps` device. 
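As a small illustrative sketch (this snippet is not part of the patch itself; the tiny `nn.Linear` model is only a placeholder), the backend can be probed and used directly from PyTorch:

```py
import torch

# check whether the MPS backend is available on this machine
if torch.backends.mps.is_available():
    device = torch.device("mps")
    model = torch.nn.Linear(8, 2).to(device)  # parameters now live in unified memory on the Apple GPU
    out = model(torch.randn(4, 8, device=device))
    print(out.shape)  # torch.Size([4, 2])
else:
    print("MPS backend not available, running on CPU instead")
```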
- Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section. + + +Some PyTorch operations are not implemented in MPS yet and will throw an error. To avoid this, you should set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU kernels instead (you'll still see a `UserWarning`). + +
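A minimal shell sketch of that fallback (the script name is a placeholder):

```bash
# route operators that have no MPS kernel to the CPU instead of raising an error
export PYTORCH_ENABLE_MPS_FALLBACK=1
python run_training.py
```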
+ +If you run into any other errors, please open an issue in the [PyTorch](https://github.com/pytorch/pytorch/issues) repository because the [`Trainer`] only integrates the MPS backend.
-This document will be completed soon with information on how to train on specialized hardware. +With the `mps` device set, you can: + +* train larger networks or batch sizes locally +* reduce data retrieval latency because the GPU's unified memory architecture allows direct access to the full memory store +* reduce costs because you don't need to train on cloud-based GPUs or add additional local GPUs + +Get started by making sure you have PyTorch installed. MPS acceleration is supported on macOS 12.3+. + +```bash +pip install torch torchvision torchaudio +``` + +[`TrainingArguments`] uses the `mps` device by default if it's available which means you don't need to explicitly set the device. For example, you can run the [run_glue.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py) script with the MPS backend automatically enabled without making any changes. + +```diff +export TASK_NAME=mrpc + +python examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path bert-base-cased \ + --task_name $TASK_NAME \ +- --use_mps_device \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir +``` + +Backends for [distributed setups](https://pytorch.org/docs/stable/distributed.html#backends) like `gloo` and `nccl` are not supported by the `mps` device which means you can only train on a single GPU with the MPS backend. + +You can learn more about the MPS backend in the [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) blog post. From 238d2e3c44366aba9dc5c770c95475765a6725cb Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Sat, 16 Dec 2023 19:41:43 +0530 Subject: [PATCH 20/30] fix resuming from ckpt when using FSDP with FULL_STATE_DICT (#27891) * fix resuming from ckpt when suing FSDP with FULL_STATE_DICT * update tests * fix tests --- src/transformers/trainer.py | 13 +++++++++---- tests/fsdp/test_fsdp.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index ffe5f5c0d1556b..9cd0bf0685e6c9 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2033,10 +2033,15 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): weights_index_file = os.path.join(resume_from_checkpoint, WEIGHTS_INDEX_NAME) safe_weights_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_NAME) safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME) - is_fsdp_ckpt = os.path.isdir(resume_from_checkpoint) and any( - FSDP_MODEL_NAME in folder_name - for folder_name in os.listdir(resume_from_checkpoint) - if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name)) + is_fsdp_ckpt = os.path.isdir(resume_from_checkpoint) and ( + # this checks the FSDP state dict when `SHARDED_STATE_DICT` is used + any( + FSDP_MODEL_NAME in folder_name + for folder_name in os.listdir(resume_from_checkpoint) + if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name)) + ) + # this checks the FSDP state dict when `FULL_STATE_DICT` is used + or os.path.isfile(os.path.join(resume_from_checkpoint, f"{FSDP_MODEL_NAME}.bin")) ) if is_fsdp_ckpt and not self.is_fsdp_enabled: diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py index 
2a9473c862ffa9..d883f29ed3698c 100644 --- a/tests/fsdp/test_fsdp.py +++ b/tests/fsdp/test_fsdp.py @@ -41,6 +41,7 @@ if is_torch_available(): from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_1 + from transformers.trainer import FSDP_MODEL_NAME else: is_torch_greater_or_equal_than_2_1 = False @@ -211,6 +212,19 @@ def test_training_and_can_resume_normally(self, state_dict_type): # resume from ckpt checkpoint = os.path.join(output_dir, "checkpoint-115") resume_args = args + f"--resume_from_checkpoint {checkpoint}".split() + + is_fsdp_ckpt = os.path.isdir(checkpoint) and ( + # this checks the FSDP state dict when `SHARDED_STATE_DICT` is used + any( + FSDP_MODEL_NAME in folder_name + for folder_name in os.listdir(checkpoint) + if os.path.isdir(os.path.join(checkpoint, folder_name)) + ) + # this checks the FSDP state dict when `FULL_STATE_DICT` is used + or os.path.isfile(os.path.join(checkpoint, f"{FSDP_MODEL_NAME}.bin")) + ) + self.assertTrue(is_fsdp_ckpt) + logs_resume = self.run_cmd_and_get_logs( use_accelerate, sharding_strategy, launcher, script, resume_args, output_dir ) From f85a1e82c16232d9a2891148d1d4a1c9247ca890 Mon Sep 17 00:00:00 2001 From: Poedator <24738311+poedator@users.noreply.github.com> Date: Sun, 17 Dec 2023 13:08:04 +0300 Subject: [PATCH 21/30] 4D `attention_mask` support (#27539) * edits to _prepare_4d_causal_attention_mask() * initial tests for 4d mask * attention_mask_for_sdpa support * added test for inner model hidden * added autotest decorators * test mask dtype to torch.int64 * torch.testing.assert_close Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * torch_device and @torch_gpu in tests * upd tests * +torch decorators * torch decorators fixed * more decorators! * even more decorators * fewer decorators --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/modeling_attn_mask_utils.py | 31 ++++- tests/test_modeling_utils.py | 133 +++++++++++++++++++ 2 files changed, 162 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_attn_mask_utils.py b/src/transformers/modeling_attn_mask_utils.py index 734f443e1fc9d4..f0964f94028529 100755 --- a/src/transformers/modeling_attn_mask_utils.py +++ b/src/transformers/modeling_attn_mask_utils.py @@ -302,10 +302,22 @@ def _prepare_4d_causal_attention_mask( key_value_length = input_shape[-1] + past_key_values_length # 4d mask is passed through the layers - if attention_mask is not None: + if attention_mask is not None and len(attention_mask.shape) == 2: attention_mask = attn_mask_converter.to_4d( attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype ) + elif attention_mask is not None and len(attention_mask.shape) == 4: + expected_shape = (input_shape[0], 1, input_shape[1], key_value_length) + if tuple(attention_mask.shape) != expected_shape: + raise ValueError( + f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." 
+ ) + else: + # if the 4D mask has correct shape - invert it and fill with negative infinity + inverted_mask = 1.0 - attention_mask + attention_mask = inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min + ) else: attention_mask = attn_mask_converter.to_causal_4d( input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device @@ -340,7 +352,22 @@ def _prepare_4d_causal_attention_mask_for_sdpa( is_tracing = torch.jit.is_tracing() if attention_mask is not None: - if torch.all(attention_mask == 1): + # 4d mask is passed through + if len(attention_mask.shape) == 4: + expected_shape = (input_shape[0], 1, input_shape[1], key_value_length) + if tuple(attention_mask.shape) != expected_shape: + raise ValueError( + f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." + ) + else: + # if the 4D mask has correct shape - invert it and fill with negative infinity + inverted_mask = 1.0 - attention_mask.to(inputs_embeds.dtype) + attention_mask = inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min + ) + return attention_mask + + elif torch.all(attention_mask == 1): if is_tracing: pass elif query_length == 1: diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index a8a483b4017c84..1f632882f02b38 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +import gc import glob import json import os @@ -49,6 +50,7 @@ require_tf, require_torch, require_torch_accelerator, + require_torch_gpu, require_torch_multi_accelerator, require_usr_bin_time, slow, @@ -1875,3 +1877,134 @@ def test_not_available_sdpa(self): ) self.assertTrue("PyTorch SDPA requirements in Transformers are not met" in str(cm.exception)) + + +@slow +@require_torch_gpu +class Mask4DTestBase(unittest.TestCase): + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + def get_test_data(self): + texts = ["the cat sat", "the cat had", "the cat is"] + encoded = [self.tokenizer.encode(t) for t in texts] + input_0 = torch.tensor(encoded, device=torch_device) + # tensor([[ 1, 278, 6635, 3290], + # [ 1, 278, 6635, 750], + # [ 1, 278, 6635, 338]], device='cuda:0') + + # Combining common prefix with the unique ending tokens: + input_1 = torch.cat([input_0[0][:-1], input_0[:, -1]]).unsqueeze(0) + # tensor([[ 1, 278, 6635, 3290, 750, 338]], device='cuda:0') + + # Creating a 4D mask where each of the last 3 tokens do not attend to each other. + mask_1 = torch.tensor( + [ + [ + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 1, 0, 0], + [1, 1, 1, 0, 1, 0], + [1, 1, 1, 0, 0, 1], + ] + ] + ], + device="cuda:0", + dtype=torch.int64, + ) + + # Creating a position_ids tensor. note the repeating figures in the end. 
+ position_ids_1 = torch.tensor([[0, 1, 2, 3, 3, 3]], device=torch_device, dtype=torch.int64) + + return input_0, input_1, mask_1, position_ids_1 + + +@slow +@require_torch_gpu +class Mask4DTestFP32(Mask4DTestBase): + def setUp(self): + model_name = "JackFram/llama-68m" # small Llama-like model from FlexFlow + model_dtype = torch.float32 + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype).to(torch_device) + + def test_attention(self): + """comparing outputs of attention layer""" + input_0, input_1, mask_1, position_ids_1 = self.get_test_data() + + hid_0 = self.model.model.embed_tokens(input_0) + outs_0 = self.model.model.layers[0].self_attn.forward(hid_0)[0] + # outs_0.shape == torch.Size([3, 4, 768]) + + hid_1 = self.model.model.embed_tokens(input_1) + outs_1 = self.model.model.layers[0].self_attn.forward( + hid_1, attention_mask=mask_1.bool(), position_ids=position_ids_1 + )[0] + # outs_1.shape == torch.Size([1, 6, 768]) + + outs_0_last_tokens = outs_0[:, -1, :] # last tokens in each batch line + outs_1_last_tokens = outs_1[0, -3:, :] # last three tokens + assert torch.allclose(outs_0_last_tokens, outs_1_last_tokens) + + def test_inner_model(self): + """comparing hidden outputs of whole inner model""" + input_0, input_1, mask_1, position_ids_1 = self.get_test_data() + + logits_0 = self.model.forward(input_0).logits + logits_1 = self.model.forward(input_1, attention_mask=mask_1.bool(), position_ids=position_ids_1).logits + + logits_0_last_tokens = logits_0[:, -1, :] # last tokens in each batch line + logits_1_last_tokens = logits_1[0, -3:, :] # last three tokens + torch.testing.assert_close( + logits_0_last_tokens, + logits_1_last_tokens, + ) + + def test_causal_model_logits(self): + """comparing logits outputs of whole inner model""" + input_0, input_1, mask_1, position_ids_1 = self.get_test_data() + + logits_0 = self.model.forward(input_0).logits + logits_1 = self.model.forward(input_1, attention_mask=mask_1.bool(), position_ids=position_ids_1).logits + + logits_0_last_tokens = logits_0[:, -1, :] # last tokens in each batch line + logits_1_last_tokens = logits_1[0, -3:, :] # last three tokens + torch.testing.assert_close( + logits_0_last_tokens, + logits_1_last_tokens, + ) + + +@slow +@require_torch_gpu +class Mask4DTestFP16(Mask4DTestBase): + test_attention = Mask4DTestFP32.test_attention + + def setUp(self): + model_name = "JackFram/llama-68m" # small Llama-like model from FlexFlow + model_dtype = torch.float16 + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype).to(torch_device) + + def test_causal_model_logits(self): + """comparing logits outputs of whole inner model""" + input_0, input_1, mask_1, position_ids_1 = self.get_test_data() + + logits_0 = self.model.forward(input_0).logits + logits_1 = self.model.forward(input_1, attention_mask=mask_1.bool(), position_ids=position_ids_1).logits + + logits_0_last_tokens = logits_0[:, -1, :] # last tokens in each batch line + logits_1_last_tokens = logits_1[0, -3:, :] # last three tokens + + indices_0 = logits_0_last_tokens.sort(descending=True).indices + indices_1 = logits_1_last_tokens.sort(descending=True).indices + + # checking logits, but note relaxed tolerances for FP16 + torch.testing.assert_close(logits_0_last_tokens, logits_1_last_tokens, atol=0.02, rtol=0.001) + + # checking tokens order for the top tokens + for token_ids_0, token_ids_1 in 
zip(indices_0, indices_1): + self.assertTrue(torch.equal(token_ids_0[:128], token_ids_1[:128])) From e6dcf8abd6f65bb4b6dfc1831b20d9ba49ce00e2 Mon Sep 17 00:00:00 2001 From: cyyever Date: Sun, 17 Dec 2023 18:13:42 +0800 Subject: [PATCH 22/30] Fix the deprecation warning of _torch_pytree._register_pytree_node (#27803) --- src/transformers/utils/generic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 34dac8bea70cfc..155b2941f197db 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -306,7 +306,7 @@ def __init_subclass__(cls) -> None: `static_graph=True` with modules that output `ModelOutput` subclasses. """ if is_torch_available(): - _torch_pytree._register_pytree_node( + torch_pytree_register_pytree_node( cls, _model_output_flatten, _model_output_unflatten, @@ -438,7 +438,11 @@ def _model_output_unflatten(values: Iterable[Any], context: "_torch_pytree.Conte output_type, keys = context return output_type(**dict(zip(keys, values))) - _torch_pytree._register_pytree_node( + if hasattr(_torch_pytree, "register_pytree_node"): + torch_pytree_register_pytree_node = _torch_pytree.register_pytree_node + else: + torch_pytree_register_pytree_node = _torch_pytree._register_pytree_node + torch_pytree_register_pytree_node( ModelOutput, _model_output_flatten, _model_output_unflatten, From b8378b658e9846e647d15a8fd85ad1421326b1e5 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 18 Dec 2023 13:46:30 +0100 Subject: [PATCH 23/30] [`Llava` / `Vip-Llava`] Add SDPA into llava (#28107) add SDPA into llava --- src/transformers/models/llava/modeling_llava.py | 8 ++++++++ src/transformers/models/vipllava/modeling_vipllava.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 821462c63c37c7..2e5062bb178c1c 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -155,6 +155,14 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + @property + def _supports_sdpa(self): + """ + Retrieve language_model's attribute to check whether the model supports + SDPA or not. + """ + return self.language_model._supports_sdpa + LLAVA_INPUTS_DOCSTRING = r""" Args: diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 1ccabd754f9084..2037d0527cf527 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -162,6 +162,14 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + @property + def _supports_sdpa(self): + """ + Retrieve language_model's attribute to check whether the model supports + SDPA or not. 
+ """ + return self.language_model._supports_sdpa + VIPLLAVA_INPUTS_DOCSTRING = r""" Args: From 7f2a8f92e4dad15e66d76aa6aa7c574f3784bb33 Mon Sep 17 00:00:00 2001 From: Aeneas Stankowski <47715864+saeneas@users.noreply.github.com> Date: Mon, 18 Dec 2023 15:04:05 +0100 Subject: [PATCH 24/30] Spelling correction (#28110) Update mixtral.md correct minor typo in overview --- docs/source/en/model_doc/mixtral.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 9f1fb49f6c835c..719b3bb9ea9ebf 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -20,7 +20,7 @@ rendered properly in your Markdown viewer. Mixtral-8x7B is Mistral AI's second Large Language Model (LLM). -The Mixtral model was proposed in the by the [Mistral AI](https://mistral.ai/) team. +The Mixtral model was proposed by the [Mistral AI](https://mistral.ai/) team. It was introduced in the [Mixtral of Experts blogpost](https://mistral.ai/news/mixtral-of-experts/) with the following introduction: From e6cb8e052a74313c2b2440c43df26303d379df71 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Mon, 18 Dec 2023 22:27:05 +0800 Subject: [PATCH 25/30] in peft finetune, only the trainable parameters need to be saved (#27825) to reduce the storage size and also save the time of checkpoint saving while using deepspeed for training Signed-off-by: Wang, Yi --- src/transformers/trainer.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 9cd0bf0685e6c9..a5fcde1f93f664 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -212,6 +212,10 @@ from accelerate.utils import DeepSpeedSchedulerWrapper +def _is_peft_model(model): + return is_peft_available() and isinstance(model, PeftModel) + + if TYPE_CHECKING: import optuna @@ -398,13 +402,12 @@ def __init__( " to `True` to avoid any unexpected behavior such as device placement mismatching." ) - _is_peft_model = is_peft_available() and isinstance(model, PeftModel) _is_quantized_and_base_model = getattr(model, "is_quantized", False) and not getattr( model, "_hf_peft_config_loaded", False ) # At this stage the model is already loaded - if _is_quantized_and_base_model and not _is_peft_model: + if _is_quantized_and_base_model and not _is_peft_model(model): raise ValueError( "You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of" " the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft" @@ -619,7 +622,7 @@ def _activate_neftune(self, model): """ unwrapped_model = unwrap_model(model) - if is_peft_available() and isinstance(unwrapped_model, PeftModel): + if _is_peft_model(unwrapped_model): embeddings = unwrapped_model.base_model.model.get_input_embeddings() else: embeddings = unwrapped_model.get_input_embeddings() @@ -640,7 +643,7 @@ def _deactivate_neftune(self, model): unwrapped_model = unwrap_model(model) - if is_peft_available() and isinstance(unwrapped_model, PeftModel): + if _is_peft_model(unwrapped_model): embeddings = unwrapped_model.base_model.model.get_input_embeddings() else: embeddings = unwrapped_model.get_input_embeddings() @@ -696,7 +699,7 @@ def _set_signature_columns_if_needed(self): if self._signature_columns is None: # Inspect model forward signature to keep only the arguments it accepts. 
model_to_inspect = self.model - if is_peft_available() and isinstance(self.model, PeftModel): + if _is_peft_model(self.model): model_to_inspect = self.model.get_base_model() signature = inspect.signature(model_to_inspect.forward) self._signature_columns = list(signature.parameters.keys()) @@ -2114,7 +2117,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): self._issue_warnings_after_load(load_result) # Load adapters following PR # 24096 - elif is_peft_available() and isinstance(model, PeftModel): + elif _is_peft_model(model): # If train a model using PEFT & LoRA, assume that adapter have been saved properly. if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"): if os.path.exists(resume_from_checkpoint): @@ -2177,7 +2180,7 @@ def _load_best_model(self): state_dict["_smp_is_partial"] = False load_result = model.load_state_dict(state_dict, strict=True) else: - if is_peft_available() and isinstance(model, PeftModel): + if _is_peft_model(model): # If train a model using PEFT & LoRA, assume that adapter have been saved properly. if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"): if os.path.exists(best_adapter_model_path) or os.path.exists(best_safe_adapter_model_path): @@ -2453,7 +2456,13 @@ def _save_optimizer_and_scheduler(self, output_dir): elif self.is_deepspeed_enabled: # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed # config `stage3_gather_16bit_weights_on_model_save` is True - self.model_wrapped.save_checkpoint(output_dir) + accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set( + inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys() + ) + if accept_exclude_frozen_parameters and _is_peft_model(self.model): + self.model_wrapped.save_checkpoint(output_dir, exclude_frozen_parameters=True) + else: + self.model_wrapped.save_checkpoint(output_dir) elif self.is_fsdp_enabled: # save fsdp specific ckpt for resuming from ckpt save_fsdp_model(self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir) @@ -2766,7 +2775,7 @@ def compute_loss(self, model, inputs, return_outputs=False): if labels is not None: unwrapped_model = unwrap_model(model) - if is_peft_available() and isinstance(unwrapped_model, PeftModel): + if _is_peft_model(unwrapped_model): model_name = unwrapped_model.base_model.model._get_name() else: model_name = unwrapped_model._get_name() From a0522de497fdd5565655b3eefbfaa91cbc2639f3 Mon Sep 17 00:00:00 2001 From: lain <70411813+not-lain@users.noreply.github.com> Date: Mon, 18 Dec 2023 16:08:37 +0100 Subject: [PATCH 26/30] fix ConversationalPipeline docstring (#28091) --- src/transformers/pipelines/conversational.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/conversational.py b/src/transformers/pipelines/conversational.py index a97223c649e100..04152270379dee 100644 --- a/src/transformers/pipelines/conversational.py +++ b/src/transformers/pipelines/conversational.py @@ -215,12 +215,12 @@ class ConversationalPipeline(Pipeline): >>> conversation = Conversation("I'm looking for a movie - what's your favourite one?") >>> conversation = chatbot(conversation) >>> conversation.messages[-1]["content"] - ' I don't really have a favorite movie, but I do like action movies. What about you?' + "I don't really have a favorite movie, but I do like action movies. What about you?" 
>>> conversation.add_message({"role": "user", "content": "That's interesting, why do you like action movies?"}) >>> conversation = chatbot(conversation) >>> conversation.messages[-1]["content"] - ' I think it's just because they're so fast-paced and action-fantastic.' + " I think it's just because they're so fast-paced and action-fantastic." ``` Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) From 7c5408dade9e7d940bed2959822e6ccbabaa132e Mon Sep 17 00:00:00 2001 From: Daize Dong <113810510+DaizeDong@users.noreply.github.com> Date: Mon, 18 Dec 2023 23:08:55 +0800 Subject: [PATCH 27/30] Disable jitter noise during evaluation in SwitchTransformers (#28077) * Disable jitter noise during evaluation * Update outdated configuration information * Formatting * Add new line --- .../modeling_gptsan_japanese.py | 2 +- .../configuration_switch_transformers.py | 18 ++++++++---------- .../modeling_switch_transformers.py | 2 +- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py index cb0d85722262bc..d9b7003050b11a 100644 --- a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py @@ -187,7 +187,7 @@ def _compute_router_probabilities(self, hidden_states: torch.Tensor) -> Tuple[to self.input_dtype = hidden_states.dtype hidden_states = hidden_states.to(self.dtype) - if self.jitter_noise > 0: + if self.training and self.jitter_noise > 0: # Multiply the token inputs by the uniform distribution - adding some noise hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise) diff --git a/src/transformers/models/switch_transformers/configuration_switch_transformers.py b/src/transformers/models/switch_transformers/configuration_switch_transformers.py index 9d8bfe8ba32904..f90874af4da67a 100644 --- a/src/transformers/models/switch_transformers/configuration_switch_transformers.py +++ b/src/transformers/models/switch_transformers/configuration_switch_transformers.py @@ -38,7 +38,7 @@ class SwitchTransformersConfig(PretrainedConfig): vocab_size (`int`, *optional*, defaults to 32128): Vocabulary size of the SwitchTransformers model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`SwitchTransformersModel`]. - d_model (`int`, *optional*, defaults to 512): + d_model (`int`, *optional*, defaults to 768): Size of the encoder layers and the pooler layer. d_kv (`int`, *optional*, defaults to 64): Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // @@ -50,21 +50,19 @@ class SwitchTransformersConfig(PretrainedConfig): Transformer. num_layers (`int`, *optional*, defaults to 12): Number of dense hidden layers in the Transformer encoder layer. - num_sparse_encoder_layers (`int`, *optional*, defaults to 6): + num_sparse_encoder_layers (`int`, *optional*, defaults to 3): Number of sparse (MoE) dense hidden layers in the Transformer encoder layer. num_decoder_layers (`int`, *optional*, defaults to 12): Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set. 
- num_sparse_decoder_layers (`int`, *optional*, defaults to 12): + num_sparse_decoder_layers (`int`, *optional*, defaults to 3): Number of sparse (MoE) dense hidden layers in the Transformer decoder layer. - num_heads (`int`, *optional*, defaults to 8): + num_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. num_experts (`int`, *optional*, defaults to 8): Number of experts for each SwitchTransformer layer. - router_type (`str`, *optional*, defaults to `"tokens_masked"`): - Router type - choose between `"tokens_masked", `"tokens_scatter"` and `"experts_masked"`. - router_bias (`bool`, *optional*, defaults to `True`): + router_bias (`bool`, *optional*, defaults to `False`): Whether to add a bias to the router. - router_jitter_noise (`float`, *optional*, defaults to 0.1): + router_jitter_noise (`float`, *optional*, defaults to 0.01): Amount of noise to add to the router. router_dtype (`str`, *optional*, default to `"float32"`): The `dtype` used for the routers. It is preferable to keep the `dtype` to `"float32"` as specified in the @@ -83,10 +81,10 @@ class SwitchTransformersConfig(PretrainedConfig): The z loss factor for the total loss. router_aux_loss_coef (`float`, *optional*, defaults to 0.001): The aux loss factor for the total loss. - initializer_factor (`float`, *optional*, defaults to 1): + initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). - feed_forward_proj (`string`, *optional*, defaults to `"relu"`): + dense_act_fn (`string`, *optional*, defaults to `"relu"`): Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. SwitchTransformersv1.1 uses the `"gated-gelu"` feed forward projection. Original SwitchTransformers uses `"relu"`. 
diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
index 3a6d19c3478d98..b123a6de2341e1 100644
--- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py
+++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
@@ -168,7 +168,7 @@ def _compute_router_probabilities(self, hidden_states: torch.Tensor) -> Tuple[to
         self.input_dtype = hidden_states.dtype
         hidden_states = hidden_states.to(self.dtype)
-        if self.jitter_noise > 0:
+        if self.training and self.jitter_noise > 0:
             # Multiply the token inputs by the uniform distribution - adding some noise
             hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)

From 0695b2421a3d1f728fa60591e3d2562a35c35fa2 Mon Sep 17 00:00:00 2001
From: Lucain
Date: Mon, 18 Dec 2023 16:18:01 +0100
Subject: [PATCH 28/30] Remove warning if `DISABLE_TELEMETRY` is used (#28113)

remove warning if DISABLE_TELEMETRY is used
---
 src/transformers/file_utils.py     | 2 +-
 src/transformers/utils/__init__.py | 2 +-
 src/transformers/utils/hub.py      | 8 +-------
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index d710296fc0f559..0dfcefd9c49cfa 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -18,6 +18,7 @@
 """
 from huggingface_hub import get_full_repo_name  # for backward compatibility
+from huggingface_hub.constants import HF_HUB_DISABLE_TELEMETRY as DISABLE_TELEMETRY  # for backward compatibility
 from . import __version__
@@ -25,7 +26,6 @@
 from .utils import (
     CLOUDFRONT_DISTRIB_PREFIX,
     CONFIG_NAME,
-    DISABLE_TELEMETRY,
     DUMMY_INPUTS,
     DUMMY_MASK,
     ENV_VARS_TRUE_AND_AUTO_VALUES,
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index e9e2f9e0403987..02959b329fd016 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -16,6 +16,7 @@
 # limitations under the License.
 from huggingface_hub import get_full_repo_name  # for backward compatibility
+from huggingface_hub.constants import HF_HUB_DISABLE_TELEMETRY as DISABLE_TELEMETRY  # for backward compatibility
 from packaging import version
 from .. import __version__
@@ -60,7 +61,6 @@
 )
 from .hub import (
     CLOUDFRONT_DISTRIB_PREFIX,
-    DISABLE_TELEMETRY,
     HF_MODULES_CACHE,
     HUGGINGFACE_CO_PREFIX,
     HUGGINGFACE_CO_RESOLVE_ENDPOINT,
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index 6ab1670ea37cf3..83ef69b5f37213 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -115,7 +115,6 @@ def is_offline_mode():
 HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(constants.HF_HOME, "modules"))
 TRANSFORMERS_DYNAMIC_MODULE_NAME = "transformers_modules"
 SESSION_ID = uuid4().hex
-DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", constants.HF_HUB_DISABLE_TELEMETRY) in ENV_VARS_TRUE_VALUES
 # Add deprecation warning for old environment variables.
 for key in ("PYTORCH_PRETRAINED_BERT_CACHE", "PYTORCH_TRANSFORMERS_CACHE", "TRANSFORMERS_CACHE"):
@@ -124,11 +123,6 @@ def is_offline_mode():
             f"Using `{key}` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.",
             FutureWarning,
         )
-if os.getenv("DISABLE_TELEMETRY") is not None:
-    warnings.warn(
-        "Using `DISABLE_TELEMETRY` is deprecated and will be removed in v5 of Transformers. Use `HF_HUB_DISABLE_TELEMETRY` instead.",
-        FutureWarning,
-    )
 S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
@@ -229,7 +223,7 @@ def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
         ua += f"; torch/{_torch_version}"
     if is_tf_available():
         ua += f"; tensorflow/{_tf_version}"
-    if DISABLE_TELEMETRY:
+    if constants.HF_HUB_DISABLE_TELEMETRY:
         return ua + "; telemetry/off"
     if is_training_run_on_sagemaker():
         ua += "; " + "; ".join(f"{k}/{v}" for k, v in define_sagemaker_information().items())
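After this patch, transformers no longer keeps its own `DISABLE_TELEMETRY` flag and instead reads `huggingface_hub.constants.HF_HUB_DISABLE_TELEMETRY` directly, so setting the hub environment variable is enough and no deprecation warning is emitted. A small sketch of the opt-out flow, assuming `huggingface_hub` is installed:

```python
import os

# Opt out of telemetry before importing anything from the Hugging Face stack;
# the constant below is resolved from the environment at import time.
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

from huggingface_hub import constants

# With the change above, transformers checks this flag when building its
# user-agent and appends "; telemetry/off" when it is set.
print("telemetry disabled:", constants.HF_HUB_DISABLE_TELEMETRY)
```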

From 71d47f0ad498b7649f11d3a9cca3cd3585e4341f Mon Sep 17 00:00:00 2001
From: Matt
Date: Mon, 18 Dec 2023 15:26:03 +0000
Subject: [PATCH 29/30] More TF fixes (#28081)

* More build_in_name_scope()
* Make sure we set the save spec now we don't do it with dummies anymore
* make fixup
---
 src/transformers/modeling_tf_utils.py            | 1 +
 tests/models/auto/test_modeling_tf_auto.py       | 4 ++--
 tests/models/gpt2/test_modeling_tf_gpt2.py       | 2 +-
 tests/models/whisper/test_modeling_tf_whisper.py | 2 +-
 tests/utils/test_modeling_tf_core.py             | 2 +-
 5 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 8c33aacd9ff890..c2daf1da6db30c 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -1147,6 +1147,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.config = config
         self.name_or_path = config.name_or_path
         self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
+        self._set_save_spec(self.input_signature)
     def get_config(self):
         return self.config.to_dict()
diff --git a/tests/models/auto/test_modeling_tf_auto.py b/tests/models/auto/test_modeling_tf_auto.py
index 537d48a57e48e5..9c284a78aee56c 100644
--- a/tests/models/auto/test_modeling_tf_auto.py
+++ b/tests/models/auto/test_modeling_tf_auto.py
@@ -211,7 +211,7 @@ def test_from_pretrained_with_tuple_values(self):
         config = copy.deepcopy(model.config)
         config.architectures = ["FunnelBaseModel"]
         model = TFAutoModel.from_config(config)
-        model.build()
+        model.build_in_name_scope()
         self.assertIsInstance(model, TFFunnelBaseModel)
@@ -249,7 +249,7 @@ def test_new_model_registration(self):
             config = NewModelConfig(**tiny_config.to_dict())
             model = auto_class.from_config(config)
-            model.build()
+            model.build_in_name_scope()
             self.assertIsInstance(model, TFNewModel)
diff --git a/tests/models/gpt2/test_modeling_tf_gpt2.py b/tests/models/gpt2/test_modeling_tf_gpt2.py
index a88435acba3e63..d636097dc28622 100644
--- a/tests/models/gpt2/test_modeling_tf_gpt2.py
+++ b/tests/models/gpt2/test_modeling_tf_gpt2.py
@@ -445,7 +445,7 @@ def test_onnx_runtime_optimize(self):
                 continue
             model = model_class(config)
-            model.build()
+            model.build_in_name_scope()
             onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset)
diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py
index e7ac93a5adbe6a..9d1a3299349a94 100644
--- a/tests/models/whisper/test_modeling_tf_whisper.py
+++ b/tests/models/whisper/test_modeling_tf_whisper.py
@@ -312,7 +312,7 @@ def test_encoder_sinusoidal_embed_positions(self):
         config = self.model_tester.get_config()
         for model_class in self.all_model_classes:
             model = model_class(config)
-            model.build()
+            model.build_in_name_scope()
             embeds = model.get_encoder().embed_positions.get_weights()[0]
             sinusoids = sinusoidal_embedding_init(embeds.shape).numpy()
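The test changes above swap `model.build()` for `model.build_in_name_scope()` so that variables created from a bare config land in the model's own name scope before weights are inspected. A short usage sketch, assuming TensorFlow is installed and a transformers version that already exposes `build_in_name_scope()` (it is introduced around this series); the tiny `BertConfig` values are illustrative only:

```python
from transformers import BertConfig, TFAutoModel

# A deliberately tiny config so the model builds quickly without downloading weights.
config = BertConfig(
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=64,
)
model = TFAutoModel.from_config(config)

# As in the updated tests, explicitly build the model inside its name scope
# before reading its (randomly initialized) weights.
model.build_in_name_scope()
print(len(model.weights), "weight tensors created")
```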
diff --git a/tests/utils/test_modeling_tf_core.py b/tests/utils/test_modeling_tf_core.py
index ebd5dfda6bc019..7ec2198dd1523f 100644
--- a/tests/utils/test_modeling_tf_core.py
+++ b/tests/utils/test_modeling_tf_core.py
@@ -217,7 +217,7 @@ def test_saved_model_creation_extended(self):
         for model_class in self.all_model_classes[:2]:
             class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
             model = model_class(config)
-            model.build()
+            model.build_in_name_scope()
             num_out = len(model(class_inputs_dict))
             for key in list(class_inputs_dict.keys()):

From 08a6e7a702d06826659eb7f0f6b9f37d33f31829 Mon Sep 17 00:00:00 2001
From: Rockerz
Date: Mon, 18 Dec 2023 23:17:54 +0530
Subject: [PATCH 30/30] Fix indentation error - semantic_segmentation.md (#28117)

Update semantic_segmentation.md
---
 docs/source/en/tasks/semantic_segmentation.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md
index f422fef9aeb566..e99499bbbbd4cd 100644
--- a/docs/source/en/tasks/semantic_segmentation.md
+++ b/docs/source/en/tasks/semantic_segmentation.md
@@ -276,8 +276,7 @@ You could also create and use your own dataset if you prefer to train with the [
                                  "label": sorted(label_paths)})
     dataset = dataset.cast_column("image", Image())
     dataset = dataset.cast_column("label", Image())
-
-        return dataset
+    return dataset
 # step 1: create Dataset objects
 train_dataset = create_dataset(image_paths_train, label_paths_train)
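For reference, here is how the corrected documentation helper reads once the stray blank line and indentation are fixed. This is a sketch assembled from the snippet in the diff above, assuming the `datasets` library; the placeholder paths are hypothetical and should point at real image/mask files before use.

```python
from datasets import Dataset, Image


def create_dataset(image_paths, label_paths):
    # Pair each image with its segmentation mask; `datasets` decodes the files lazily.
    dataset = Dataset.from_dict({"image": sorted(image_paths),
                                 "label": sorted(label_paths)})
    dataset = dataset.cast_column("image", Image())
    dataset = dataset.cast_column("label", Image())
    return dataset  # now indented inside the function, matching the corrected docs


# Placeholder paths for illustration only.
image_paths_train = ["images/0001.png", "images/0002.png"]
label_paths_train = ["masks/0001.png", "masks/0002.png"]

# step 1: create Dataset objects
train_dataset = create_dataset(image_paths_train, label_paths_train)
print(train_dataset)
```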