From 7c5b746ca70d86381275401f9a0e8ce6008f3e78 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Thu, 11 Apr 2024 11:29:06 -0400 Subject: [PATCH 1/8] Alias --- src/transformers/training_args.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index cdf6325c4b4ae6..9df0b563b41da9 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -18,7 +18,7 @@ import math import os import warnings -from dataclasses import asdict, dataclass, field, fields +from dataclasses import InitVar, asdict, dataclass, field, fields from datetime import timedelta from enum import Enum from pathlib import Path @@ -203,7 +203,7 @@ class TrainingArguments: Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. - evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + evaluation_strategy (or eval_strategy) (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): The evaluation strategy to adopt during training. Possible values are: - `"no"`: No evaluation is done during training. @@ -737,6 +737,10 @@ class TrainingArguments: default="no", metadata={"help": "The evaluation strategy to use."}, ) + eval_strategy: InitVar[Union[IntervalStrategy, str]] = field( + default=None, + metadata={"help": "The evaluation strategy to use."}, + ) prediction_loss_only: bool = field( default=False, metadata={"help": "When performing evaluation and predictions, only returns the loss."}, @@ -1393,6 +1397,10 @@ def __post_init__(self): if self.disable_tqdm is None: self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN + # Deal with alias + if self.eval_strategy is not None: + self.evaluation_strategy = self.eval_strategy + if isinstance(self.evaluation_strategy, EvaluationStrategy): warnings.warn( "using `EvaluationStrategy` for `evaluation_strategy` is deprecated and will be removed in version 5" From 6da2f15970b0d52869678ecafc63c4b9a6562274 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Thu, 11 Apr 2024 11:31:32 -0400 Subject: [PATCH 2/8] Note alias --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 9df0b563b41da9..d03dac9fc0e974 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -739,7 +739,7 @@ class TrainingArguments: ) eval_strategy: InitVar[Union[IntervalStrategy, str]] = field( default=None, - metadata={"help": "The evaluation strategy to use."}, + metadata={"help": "Alias for `evaluation_strategy`."}, ) prediction_loss_only: bool = field( default=False, From 068c11314dee842f82396bdddd13d7619c90d7ba Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 16 Apr 2024 11:42:26 -0400 Subject: [PATCH 3/8] Tests and src --- .../integrations/integration_utils.py | 4 +- src/transformers/trainer_callback.py | 6 +- src/transformers/training_args.py | 63 ++++++++++--------- src/transformers/training_args_tf.py | 4 +- src/transformers/utils/notebook.py | 6 +- tests/deepspeed/test_deepspeed.py | 2 +- tests/extended/test_trainer_ext.py | 2 +- tests/fsdp/test_fsdp.py | 2 +- tests/trainer/test_trainer.py | 32 +++++----- 
tests/trainer/test_trainer_callback.py | 10 +-- 10 files changed, 67 insertions(+), 64 deletions(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 00074a9574b548..63b9e050d4a1d3 100644 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -320,13 +320,13 @@ def _objective(trial: dict, local_trainer): # Check for `do_eval` and `eval_during_training` for schedulers that require intermediate reporting. if isinstance( kwargs["scheduler"], (ASHAScheduler, MedianStoppingRule, HyperBandForBOHB, PopulationBasedTraining) - ) and (not trainer.args.do_eval or trainer.args.evaluation_strategy == IntervalStrategy.NO): + ) and (not trainer.args.do_eval or trainer.args.eval_strategy == IntervalStrategy.NO): raise RuntimeError( "You are using {cls} as a scheduler but you haven't enabled evaluation during training. " "This means your trials will not report intermediate results to Ray Tune, and " "can thus not be stopped early or used to exploit other trials parameters. " "If this is what you want, do not use {cls}. If you would like to use {cls}, " - "make sure you pass `do_eval=True` and `evaluation_strategy='steps'` in the " + "make sure you pass `do_eval=True` and `eval_strategy='steps'` in the " "Trainer `args`.".format(cls=type(kwargs["scheduler"]).__name__) ) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 1e3b0e587a74c6..b8e9aaf6fe55ec 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -443,7 +443,7 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra # Evaluate if ( - args.evaluation_strategy == IntervalStrategy.STEPS + args.eval_strategy == IntervalStrategy.STEPS and state.global_step % state.eval_steps == 0 and args.eval_delay <= state.global_step ): @@ -469,7 +469,7 @@ def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: Tr control.should_log = True # Evaluate - if args.evaluation_strategy == IntervalStrategy.EPOCH and args.eval_delay <= state.epoch: + if args.eval_strategy == IntervalStrategy.EPOCH and args.eval_delay <= state.epoch: control.should_evaluate = True # Save @@ -580,7 +580,7 @@ def on_train_begin(self, args, state, control, **kwargs): args.metric_for_best_model is not None ), "EarlyStoppingCallback requires metric_for_best_model is defined" assert ( - args.evaluation_strategy != IntervalStrategy.NO + args.eval_strategy != IntervalStrategy.NO ), "EarlyStoppingCallback requires IntervalStrategy of steps or epoch" def on_evaluate(self, args, state, control, metrics, **kwargs): diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index d03dac9fc0e974..5da606f34a2b58 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -195,7 +195,7 @@ class TrainingArguments: by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. do_eval (`bool`, *optional*): - Whether to run evaluation on the validation set or not. Will be set to `True` if `evaluation_strategy` is + Whether to run evaluation on the validation set or not. Will be set to `True` if `eval_strategy` is different from `"no"`. This argument is not directly used by [`Trainer`], it's intended to be used by your training/evaluation scripts instead. 
See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. @@ -203,7 +203,7 @@ class TrainingArguments: Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. - evaluation_strategy (or eval_strategy) (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + eval_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): The evaluation strategy to adopt during training. Possible values are: - `"no"`: No evaluation is done during training. @@ -232,7 +232,7 @@ class TrainingArguments: requires more memory). eval_delay (`float`, *optional*): Number of epochs or steps to wait for before the first evaluation can be performed, depending on the - evaluation_strategy. + eval_strategy. learning_rate (`float`, *optional*, defaults to 5e-5): The initial learning rate for [`AdamW`] optimizer. weight_decay (`float`, *optional*, defaults to 0): @@ -375,7 +375,7 @@ class TrainingArguments: Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not. eval_steps (`int` or `float`, *optional*): - Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same + Number of update steps between two evaluations if `eval_strategy="steps"`. Will default to the same value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. dataloader_num_workers (`int`, *optional*, defaults to 0): @@ -409,7 +409,7 @@ class TrainingArguments: - When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in + When set to `True`, the parameters `save_strategy` needs to be the same as `eval_strategy`, and in the case it is "steps", `save_steps` must be a round multiple of `eval_steps`. @@ -733,13 +733,9 @@ class TrainingArguments: do_train: bool = field(default=False, metadata={"help": "Whether to run training."}) do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."}) do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."}) - evaluation_strategy: Union[IntervalStrategy, str] = field( - default="no", - metadata={"help": "The evaluation strategy to use."}, - ) - eval_strategy: InitVar[Union[IntervalStrategy, str]] = field( + eval_strategy: Union[IntervalStrategy, str] = field( default=None, - metadata={"help": "Alias for `evaluation_strategy`."}, + metadata={"help": "The evaluation strategy to use."}, ) prediction_loss_only: bool = field( default=False, @@ -786,7 +782,7 @@ class TrainingArguments: metadata={ "help": ( "Number of epochs or steps to wait for before the first evaluation can be performed, depending on the" - " evaluation_strategy." + " eval_strategy." ) }, ) @@ -1273,6 +1269,10 @@ class TrainingArguments: "choices": ["auto", "apex", "cpu_amp"], }, ) + evaluation_strategy: Union[IntervalStrategy, str] = field( + default=None, + metadata={"help": "Deprecated. 
Use `eval_strategy` instead"}, + ) push_to_hub_model_id: Optional[str] = field( default=None, metadata={"help": "The name of the repository to which push the `Trainer`."} ) @@ -1397,18 +1397,21 @@ def __post_init__(self): if self.disable_tqdm is None: self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN - # Deal with alias - if self.eval_strategy is not None: - self.evaluation_strategy = self.eval_strategy + if self.evaluation_strategy is not None: + warnings.warn( + "`evaluation_strategy` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use `eval_strategy` instead", + FutureWarning, + ) + self.eval_strategy = self.evaluation_strategy - if isinstance(self.evaluation_strategy, EvaluationStrategy): + if isinstance(self.eval_strategy, EvaluationStrategy): warnings.warn( "using `EvaluationStrategy` for `evaluation_strategy` is deprecated and will be removed in version 5" " of 🤗 Transformers. Use `IntervalStrategy` instead", FutureWarning, ) # Go back to the underlying string or we won't be able to instantiate `IntervalStrategy` on it. - self.evaluation_strategy = self.evaluation_strategy.value + self.eval_strategy = self.eval_strategy.value if self.no_cuda: warnings.warn( "using `no_cuda` is deprecated and will be removed in version 5.0 of 🤗 Transformers. " @@ -1417,23 +1420,23 @@ def __post_init__(self): ) self.use_cpu = self.no_cuda - self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy) + self.eval_strategy = IntervalStrategy(self.eval_strategy) self.logging_strategy = IntervalStrategy(self.logging_strategy) self.save_strategy = IntervalStrategy(self.save_strategy) self.hub_strategy = HubStrategy(self.hub_strategy) self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) - if self.do_eval is False and self.evaluation_strategy != IntervalStrategy.NO: + if self.do_eval is False and self.eval_strategy != IntervalStrategy.NO: self.do_eval = True # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero - if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): + if self.eval_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): if self.logging_steps > 0: logger.info(f"using `logging_steps` to initialize `eval_steps` to {self.logging_steps}") self.eval_steps = self.logging_steps else: raise ValueError( - f"evaluation strategy {self.evaluation_strategy} requires either non-zero --eval_steps or" + f"evaluation strategy {self.eval_strategy} requires either non-zero --eval_steps or" " --logging_steps" ) @@ -1445,7 +1448,7 @@ def __post_init__(self): if self.logging_steps != int(self.logging_steps): raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}") self.logging_steps = int(self.logging_steps) - if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1: + if self.eval_strategy == IntervalStrategy.STEPS and self.eval_steps > 1: if self.eval_steps != int(self.eval_steps): raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}") self.eval_steps = int(self.eval_steps) @@ -1456,12 +1459,12 @@ def __post_init__(self): # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. 
if self.load_best_model_at_end: - if self.evaluation_strategy != self.save_strategy: + if self.eval_strategy != self.save_strategy: raise ValueError( "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation " - f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" + f"strategy: {self.eval_strategy}\n- Save strategy: {self.save_strategy}" ) - if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: + if self.eval_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: if self.eval_steps < 1 or self.save_steps < 1: if not (self.eval_steps < 1 and self.save_steps < 1): raise ValueError( @@ -1539,7 +1542,7 @@ def __post_init__(self): raise ValueError(" `--half_precision_backend apex`: GPU bf16 is not supported by apex.") if self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU: - if self.evaluation_strategy == IntervalStrategy.NO: + if self.eval_strategy == IntervalStrategy.NO: raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires an eval strategy") if not is_torch_available(): raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires torch>=0.2.0") @@ -2396,7 +2399,7 @@ def set_evaluate( but requires more memory). delay (`float`, *optional*): Number of epochs or steps to wait for before the first evaluation can be performed, depending on the - evaluation_strategy. + eval_strategy. loss_only (`bool`, *optional*, defaults to `False`): Ignores all outputs except the loss. jit_mode (`bool`, *optional*): @@ -2413,10 +2416,10 @@ def set_evaluate( 100 ``` """ - self.evaluation_strategy = IntervalStrategy(strategy) - if self.evaluation_strategy == IntervalStrategy.STEPS and steps == 0: + self.eval_strategy = IntervalStrategy(strategy) + if self.eval_strategy == IntervalStrategy.STEPS and steps == 0: raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.") - self.do_eval = self.evaluation_strategy != IntervalStrategy.NO + self.do_eval = self.eval_strategy != IntervalStrategy.NO self.eval_steps = steps self.per_device_eval_batch_size = batch_size self.eval_accumulation_steps = accumulation_steps diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 4498f4cb793b92..12a6c5afe926bf 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -49,7 +49,7 @@ class TFTrainingArguments(TrainingArguments): by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. do_eval (`bool`, *optional*): - Whether to run evaluation on the validation set or not. Will be set to `True` if `evaluation_strategy` is + Whether to run evaluation on the validation set or not. Will be set to `True` if `eval_strategy` is different from `"no"`. This argument is not directly used by [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. @@ -57,7 +57,7 @@ class TFTrainingArguments(TrainingArguments): Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. 
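For reference, a minimal usage sketch of the rename this patch series introduces (not part of the diff itself; it assumes an installation of transformers that already contains these changes, plus torch/accelerate so `TrainingArguments` can be instantiated):

```py
import warnings

from transformers import TrainingArguments

# New, preferred spelling introduced by this patch series.
args = TrainingArguments(output_dir="tmp", eval_strategy="epoch")

# The old spelling is kept as a deprecated alias: `__post_init__` copies it into
# `eval_strategy` and emits a FutureWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = TrainingArguments(output_dir="tmp", evaluation_strategy="epoch")

assert legacy.eval_strategy == args.eval_strategy  # both resolve to IntervalStrategy.EPOCH
assert any(issubclass(w.category, FutureWarning) for w in caught)
```
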
- evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + eval_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): The evaluation strategy to adopt during training. Possible values are: - `"no"`: No evaluation is done during training. diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py index f7396642732e58..9704aca242a60f 100644 --- a/src/transformers/utils/notebook.py +++ b/src/transformers/utils/notebook.py @@ -292,11 +292,11 @@ def __init__(self): self._force_next_update = False def on_train_begin(self, args, state, control, **kwargs): - self.first_column = "Epoch" if args.evaluation_strategy == IntervalStrategy.EPOCH else "Step" + self.first_column = "Epoch" if args.eval_strategy == IntervalStrategy.EPOCH else "Step" self.training_loss = 0 self.last_log = 0 column_names = [self.first_column] + ["Training Loss"] - if args.evaluation_strategy != IntervalStrategy.NO: + if args.eval_strategy != IntervalStrategy.NO: column_names.append("Validation Loss") self.training_tracker = NotebookTrainingTracker(state.max_steps, column_names) @@ -328,7 +328,7 @@ def on_predict(self, args, state, control, **kwargs): def on_log(self, args, state, control, logs=None, **kwargs): # Only for when there is no evaluation - if args.evaluation_strategy == IntervalStrategy.NO and "loss" in logs: + if args.eval_strategy == IntervalStrategy.NO and "loss" in logs: values = {"Training Loss": logs["loss"]} # First column is necessarily Step sine we're not in epoch eval strategy values["Step"] = state.global_step diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 81308d32c6cf22..f9b74783600b5a 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -959,7 +959,7 @@ def test_load_best_model(self, stage, dtype): "do_train": True, "do_eval": True, "optim": "adafactor", - "evaluation_strategy": "steps", + "eval_strategy": "steps", "eval_steps": 1, "save_strategy": "steps", "save_steps": 1, diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index 5c33eb2d9ed750..4bda892162fdad 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -308,7 +308,7 @@ def run_trainer( --per_device_eval_batch_size 4 --max_eval_samples 8 --val_max_target_length {max_len} - --evaluation_strategy steps + --eval_strategy steps --eval_steps {str(eval_steps)} """.split() diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py index aeb232fd9e8e0b..71293f1601adde 100644 --- a/tests/fsdp/test_fsdp.py +++ b/tests/fsdp/test_fsdp.py @@ -308,6 +308,6 @@ def get_base_args(self, output_dir, num_epochs, logging_steps): --logging_steps {logging_steps} --save_strategy epoch --do_eval - --evaluation_strategy epoch + --eval_strategy epoch --report_to none """ diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 483022a09e6fab..2cbb59f235fe30 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -740,7 +740,7 @@ def test_reduce_lr_on_plateau_args(self): eval_dataset = RegressionDataset(length=64) args = TrainingArguments( "./regression", - evaluation_strategy="epoch", + eval_strategy="epoch", metric_for_best_model="eval_loss", ) model = RegressionModel() @@ -772,7 +772,7 @@ def log(self, logs): args = TrainingArguments( "./regression", lr_scheduler_type="reduce_lr_on_plateau", - evaluation_strategy="epoch", + eval_strategy="epoch", 
metric_for_best_model="eval_loss", num_train_epochs=10, learning_rate=0.2, @@ -2210,7 +2210,7 @@ def test_load_best_model_at_end(self): output_dir=tmpdir, learning_rate=0.1, eval_steps=5, - evaluation_strategy="steps", + eval_strategy="steps", save_steps=5, load_best_model_at_end=True, ) @@ -2226,7 +2226,7 @@ def test_load_best_model_at_end(self): output_dir=tmpdir, learning_rate=0.1, eval_steps=5, - evaluation_strategy="steps", + eval_strategy="steps", save_steps=5, load_best_model_at_end=True, metric_for_best_model="accuracy", @@ -2243,7 +2243,7 @@ def test_load_best_model_at_end(self): b=2.5, output_dir=tmpdir, learning_rate=0.1, - evaluation_strategy="epoch", + eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model="accuracy", @@ -2262,7 +2262,7 @@ def test_load_best_model_at_end(self): output_dir=tmpdir, learning_rate=0.1, eval_steps=5, - evaluation_strategy="steps", + eval_strategy="steps", save_steps=5, load_best_model_at_end=True, pretrained=False, @@ -2283,7 +2283,7 @@ def test_load_best_model_from_safetensors(self): output_dir=tmpdir, learning_rate=0.1, eval_steps=5, - evaluation_strategy="steps", + eval_strategy="steps", save_steps=5, load_best_model_at_end=True, save_safetensors=save_safetensors, @@ -2437,7 +2437,7 @@ def test_early_stopping_callback(self): gradient_accumulation_steps=1, per_device_train_batch_size=16, load_best_model_at_end=True, - evaluation_strategy=IntervalStrategy.EPOCH, + eval_strategy=IntervalStrategy.EPOCH, save_strategy=IntervalStrategy.EPOCH, compute_metrics=AlmostAccuracy(), metric_for_best_model="accuracy", @@ -2453,7 +2453,7 @@ def test_early_stopping_callback(self): num_train_epochs=20, gradient_accumulation_steps=1, per_device_train_batch_size=16, - evaluation_strategy=IntervalStrategy.EPOCH, + eval_strategy=IntervalStrategy.EPOCH, compute_metrics=AlmostAccuracy(), metric_for_best_model="accuracy", ) @@ -2497,7 +2497,7 @@ def test_checkpoint_rotation(self): # With best model at end trainer = get_regression_trainer( - output_dir=tmp_dir, evaluation_strategy="steps", load_best_model_at_end=True, save_total_limit=2 + output_dir=tmp_dir, eval_strategy="steps", load_best_model_at_end=True, save_total_limit=2 ) trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5") self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25]) @@ -2505,7 +2505,7 @@ def test_checkpoint_rotation(self): # Edge case: we don't always honor save_total_limit=1 if load_best_model_at_end=True to be able to resume # from checkpoint trainer = get_regression_trainer( - output_dir=tmp_dir, evaluation_strategy="steps", load_best_model_at_end=True, save_total_limit=1 + output_dir=tmp_dir, eval_strategy="steps", load_best_model_at_end=True, save_total_limit=1 ) trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-25") self.check_checkpoint_deletion(trainer, tmp_dir, [25]) @@ -3312,7 +3312,7 @@ def hp_name(trial): output_dir=tmp_dir, learning_rate=0.1, logging_steps=1, - evaluation_strategy=IntervalStrategy.EPOCH, + eval_strategy=IntervalStrategy.EPOCH, save_strategy=IntervalStrategy.EPOCH, num_train_epochs=4, disable_tqdm=True, @@ -3361,7 +3361,7 @@ def compute_objective(metrics: Dict[str, float]) -> List[float]: output_dir=tmp_dir, learning_rate=0.1, logging_steps=1, - evaluation_strategy=IntervalStrategy.EPOCH, + eval_strategy=IntervalStrategy.EPOCH, save_strategy=IntervalStrategy.EPOCH, num_train_epochs=10, disable_tqdm=True, @@ -3419,7 +3419,7 @@ def hp_name(params): output_dir=tmp_dir, learning_rate=0.1, 
logging_steps=1, - evaluation_strategy=IntervalStrategy.EPOCH, + eval_strategy=IntervalStrategy.EPOCH, save_strategy=IntervalStrategy.EPOCH, num_train_epochs=4, disable_tqdm=True, @@ -3482,7 +3482,7 @@ def hp_name(trial): output_dir=tmp_dir, learning_rate=0.1, logging_steps=1, - evaluation_strategy=IntervalStrategy.EPOCH, + eval_strategy=IntervalStrategy.EPOCH, save_strategy=IntervalStrategy.EPOCH, num_train_epochs=4, disable_tqdm=True, @@ -3902,7 +3902,7 @@ def hp_name(params): output_dir=tmp_dir, learning_rate=0.1, logging_steps=1, - evaluation_strategy=IntervalStrategy.EPOCH, + eval_strategy=IntervalStrategy.EPOCH, save_strategy=IntervalStrategy.EPOCH, num_train_epochs=4, disable_tqdm=True, diff --git a/tests/trainer/test_trainer_callback.py b/tests/trainer/test_trainer_callback.py index 8e851132c2daab..b712edca385c25 100644 --- a/tests/trainer/test_trainer_callback.py +++ b/tests/trainer/test_trainer_callback.py @@ -133,12 +133,12 @@ def get_expected_events(self, trainer): expected_events += ["on_step_begin", "on_step_end"] if step % trainer.args.logging_steps == 0: expected_events.append("on_log") - if trainer.args.evaluation_strategy == IntervalStrategy.STEPS and step % trainer.args.eval_steps == 0: + if trainer.args.eval_strategy == IntervalStrategy.STEPS and step % trainer.args.eval_steps == 0: expected_events += evaluation_events.copy() if step % trainer.args.save_steps == 0: expected_events.append("on_save") expected_events.append("on_epoch_end") - if trainer.args.evaluation_strategy == IntervalStrategy.EPOCH: + if trainer.args.eval_strategy == IntervalStrategy.EPOCH: expected_events += evaluation_events.copy() expected_events += ["on_log", "on_train_end"] return expected_events @@ -215,12 +215,12 @@ def test_event_flow(self): events = trainer.callback_handler.callbacks[-2].events self.assertEqual(events, self.get_expected_events(trainer)) - trainer = self.get_trainer(callbacks=[MyTestTrainerCallback], eval_steps=5, evaluation_strategy="steps") + trainer = self.get_trainer(callbacks=[MyTestTrainerCallback], eval_steps=5, eval_strategy="steps") trainer.train() events = trainer.callback_handler.callbacks[-2].events self.assertEqual(events, self.get_expected_events(trainer)) - trainer = self.get_trainer(callbacks=[MyTestTrainerCallback], evaluation_strategy="epoch") + trainer = self.get_trainer(callbacks=[MyTestTrainerCallback], eval_strategy="epoch") trainer.train() events = trainer.callback_handler.callbacks[-2].events self.assertEqual(events, self.get_expected_events(trainer)) @@ -231,7 +231,7 @@ def test_event_flow(self): logging_steps=3, save_steps=10, eval_steps=5, - evaluation_strategy="steps", + eval_strategy="steps", ) trainer.train() events = trainer.callback_handler.callbacks[-2].events From 959a6794d1363b6758fcd3a94379e8220b8bd1c0 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 16 Apr 2024 11:43:45 -0400 Subject: [PATCH 4/8] Rest --- docs/source/de/training.md | 4 ++-- docs/source/en/model_memory_anatomy.md | 2 +- docs/source/en/tasks/asr.md | 2 +- docs/source/en/tasks/audio_classification.md | 2 +- .../en/tasks/document_question_answering.md | 2 +- docs/source/en/tasks/image_captioning.md | 2 +- docs/source/en/tasks/image_classification.md | 2 +- ...ge_distillation_for_image_classification.md | 2 +- docs/source/en/tasks/language_modeling.md | 2 +- .../en/tasks/masked_language_modeling.md | 2 +- docs/source/en/tasks/multiple_choice.md | 2 +- docs/source/en/tasks/question_answering.md | 2 +- docs/source/en/tasks/semantic_segmentation.md | 2 +- 
.../source/en/tasks/sequence_classification.md | 2 +- docs/source/en/tasks/summarization.md | 2 +- docs/source/en/tasks/text-to-speech.md | 2 +- docs/source/en/tasks/token_classification.md | 2 +- docs/source/en/tasks/translation.md | 2 +- docs/source/en/tasks/video_classification.md | 2 +- docs/source/en/trainer.md | 2 +- docs/source/en/training.md | 4 ++-- docs/source/es/tasks/asr.md | 2 +- docs/source/es/tasks/image_captioning.md | 2 +- docs/source/es/tasks/image_classification.md | 2 +- docs/source/es/tasks/language_modeling.md | 4 ++-- docs/source/es/tasks/multiple_choice.md | 2 +- docs/source/es/tasks/question_answering.md | 2 +- docs/source/es/tasks/summarization.md | 2 +- docs/source/es/trainer.md | 2 +- docs/source/es/training.md | 4 ++-- docs/source/it/migration.md | 2 +- docs/source/it/training.md | 4 ++-- docs/source/ja/model_memory_anatomy.md | 2 +- docs/source/ja/tasks/asr.md | 2 +- docs/source/ja/tasks/audio_classification.md | 2 +- .../ja/tasks/document_question_answering.md | 2 +- docs/source/ja/tasks/image_captioning.md | 2 +- docs/source/ja/tasks/image_classification.md | 2 +- ...ge_distillation_for_image_classification.md | 2 +- docs/source/ja/tasks/language_modeling.md | 2 +- .../ja/tasks/masked_language_modeling.md | 2 +- docs/source/ja/tasks/multiple_choice.md | 2 +- docs/source/ja/tasks/question_answering.md | 2 +- docs/source/ja/tasks/semantic_segmentation.md | 2 +- .../source/ja/tasks/sequence_classification.md | 2 +- docs/source/ja/tasks/summarization.md | 2 +- docs/source/ja/tasks/text-to-speech.md | 2 +- docs/source/ja/tasks/token_classification.md | 2 +- docs/source/ja/tasks/translation.md | 2 +- docs/source/ja/tasks/video_classification.md | 2 +- docs/source/ja/training.md | 4 ++-- docs/source/ko/model_memory_anatomy.md | 2 +- docs/source/ko/tasks/asr.md | 2 +- docs/source/ko/tasks/audio_classification.md | 2 +- .../ko/tasks/document_question_answering.md | 2 +- docs/source/ko/tasks/image_captioning.md | 2 +- docs/source/ko/tasks/image_classification.md | 2 +- docs/source/ko/tasks/language_modeling.md | 2 +- .../ko/tasks/masked_language_modeling.md | 2 +- docs/source/ko/tasks/multiple_choice.md | 2 +- docs/source/ko/tasks/question_answering.md | 2 +- docs/source/ko/tasks/semantic_segmentation.md | 2 +- .../source/ko/tasks/sequence_classification.md | 2 +- docs/source/ko/tasks/summarization.md | 2 +- docs/source/ko/tasks/token_classification.md | 2 +- docs/source/ko/tasks/translation.md | 2 +- docs/source/ko/tasks/video_classification.md | 2 +- docs/source/ko/training.md | 4 ++-- docs/source/pt/tasks/token_classification.md | 2 +- docs/source/pt/training.md | 4 ++-- docs/source/zh/tasks/asr.md | 2 +- docs/source/zh/training.md | 4 ++-- examples/flax/language-modeling/README.md | 4 ++-- examples/legacy/seq2seq/finetune.sh | 2 +- examples/legacy/seq2seq/finetune_tpu.sh | 2 +- examples/legacy/seq2seq/finetune_trainer.py | 2 +- .../legacy/seq2seq/train_distil_marian_enro.sh | 2 +- .../seq2seq/train_distil_marian_enro_tpu.sh | 2 +- .../legacy/seq2seq/train_distilbart_cnn.sh | 2 +- .../legacy/seq2seq/train_mbart_cc25_enro.sh | 2 +- examples/pytorch/README.md | 2 +- .../pytorch/audio-classification/README.md | 4 ++-- .../pytorch/image-classification/README.md | 2 +- examples/pytorch/image-pretraining/README.md | 6 +++--- .../pytorch/semantic-segmentation/README.md | 2 +- examples/pytorch/speech-recognition/README.md | 18 +++++++++--------- .../examples/train_complexity_predictor.py | 2 +- .../research_projects/layoutlmv3/README.md | 4 ++-- 
.../robust-speech-event/README.md | 4 ++-- .../README.md | 2 +- .../finetuning.py | 10 +++++----- .../self-training-text-classification/run.sh | 2 +- .../selftraining.py | 8 ++++---- examples/research_projects/tapex/README.md | 12 ++++++------ .../wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md | 4 ++-- examples/research_projects/wav2vec2/README.md | 8 ++++---- .../wav2vec2/finetune_base_100.sh | 2 +- .../wav2vec2/finetune_base_timit_asr.sh | 2 +- .../wav2vec2/finetune_large_lv60_100.sh | 2 +- .../wav2vec2/finetune_large_lv60_timit_asr.sh | 2 +- ...etune_large_xlsr_53_arabic_speech_corpus.sh | 2 +- .../wav2vec2/finetune_wav2vec2_xlsr_turkish.sh | 2 +- .../wav2vec2/test_wav2vec2_deepspeed.py | 2 +- examples/research_projects/xtreme-s/README.md | 4 ++-- .../tensorflow/image-classification/README.md | 2 +- tests/trainer/test_trainer_seq2seq.py | 2 +- 106 files changed, 146 insertions(+), 146 deletions(-) diff --git a/docs/source/de/training.md b/docs/source/de/training.md index 7b1bd3e5d0c368..806a380b6cebc9 100644 --- a/docs/source/de/training.md +++ b/docs/source/de/training.md @@ -128,12 +128,12 @@ Rufen Sie [`~evaluate.compute`] auf `metric` auf, um die Genauigkeit Ihrer Vorhe ... return metric.compute(predictions=predictions, references=labels) ``` -Wenn Sie Ihre Bewertungsmetriken während der Feinabstimmung überwachen möchten, geben Sie den Parameter `evaluation_strategy` in Ihren Trainingsargumenten an, um die Bewertungsmetrik am Ende jeder Epoche zu ermitteln: +Wenn Sie Ihre Bewertungsmetriken während der Feinabstimmung überwachen möchten, geben Sie den Parameter `eval_strategy` in Ihren Trainingsargumenten an, um die Bewertungsmetrik am Ende jeder Epoche zu ermitteln: ```py >>> from transformers import TrainingArguments, Trainer ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +>>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") ``` ### Trainer diff --git a/docs/source/en/model_memory_anatomy.md b/docs/source/en/model_memory_anatomy.md index c820681a7af0fc..1fc7b495932aff 100644 --- a/docs/source/en/model_memory_anatomy.md +++ b/docs/source/en/model_memory_anatomy.md @@ -145,7 +145,7 @@ arguments: ```py default_args = { "output_dir": "tmp", - "evaluation_strategy": "steps", + "eval_strategy": "steps", "num_train_epochs": 1, "log_level": "error", "report_to": "none", diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md index 737460ed297bcf..a1a96271102ba4 100644 --- a/docs/source/en/tasks/asr.md +++ b/docs/source/en/tasks/asr.md @@ -270,7 +270,7 @@ At this point, only three steps remain: ... gradient_checkpointing=True, ... fp16=True, ... group_by_length=True, -... evaluation_strategy="steps", +... eval_strategy="steps", ... per_device_eval_batch_size=8, ... save_steps=1000, ... eval_steps=1000, diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md index 678af90c4fa079..5ea3567f4c3c6c 100644 --- a/docs/source/en/tasks/audio_classification.md +++ b/docs/source/en/tasks/audio_classification.md @@ -221,7 +221,7 @@ At this point, only three steps remain: ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_mind_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... learning_rate=3e-5, ... 
per_device_train_batch_size=32, diff --git a/docs/source/en/tasks/document_question_answering.md b/docs/source/en/tasks/document_question_answering.md index 24bf3a069ac9a5..3d3acf0541dbf9 100644 --- a/docs/source/en/tasks/document_question_answering.md +++ b/docs/source/en/tasks/document_question_answering.md @@ -399,7 +399,7 @@ In this case the `output_dir` will also be the name of the repo where your model ... num_train_epochs=20, ... save_steps=200, ... logging_steps=50, -... evaluation_strategy="steps", +... eval_strategy="steps", ... learning_rate=5e-5, ... save_total_limit=2, ... remove_unused_columns=False, diff --git a/docs/source/en/tasks/image_captioning.md b/docs/source/en/tasks/image_captioning.md index b426cbf6383187..633ccc491ebb35 100644 --- a/docs/source/en/tasks/image_captioning.md +++ b/docs/source/en/tasks/image_captioning.md @@ -196,7 +196,7 @@ training_args = TrainingArguments( per_device_eval_batch_size=32, gradient_accumulation_steps=2, save_total_limit=3, - evaluation_strategy="steps", + eval_strategy="steps", eval_steps=50, save_strategy="steps", save_steps=50, diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 30c517f3be6499..25f232bc00a728 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -302,7 +302,7 @@ At this point, only three steps remain: >>> training_args = TrainingArguments( ... output_dir="my_awesome_food_model", ... remove_unused_columns=False, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... learning_rate=5e-5, ... per_device_train_batch_size=16, diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md index 8448e53011494c..f856e35b1740bd 100644 --- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md +++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md @@ -112,7 +112,7 @@ training_args = TrainingArguments( fp16=True, logging_dir=f"{repo_name}/logs", logging_strategy="epoch", - evaluation_strategy="epoch", + eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model="accuracy", diff --git a/docs/source/en/tasks/language_modeling.md b/docs/source/en/tasks/language_modeling.md index 88a45192713912..380c089d059f34 100644 --- a/docs/source/en/tasks/language_modeling.md +++ b/docs/source/en/tasks/language_modeling.md @@ -249,7 +249,7 @@ At this point, only three steps remain: ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_eli5_clm-model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... weight_decay=0.01, ... push_to_hub=True, diff --git a/docs/source/en/tasks/masked_language_modeling.md b/docs/source/en/tasks/masked_language_modeling.md index de91cd587a6a0c..1736e858eeb36e 100644 --- a/docs/source/en/tasks/masked_language_modeling.md +++ b/docs/source/en/tasks/masked_language_modeling.md @@ -238,7 +238,7 @@ At this point, only three steps remain: ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_eli5_mlm_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... num_train_epochs=3, ... 
weight_decay=0.01, diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md index 5cf17448f0a66a..9baa0eea5d5934 100644 --- a/docs/source/en/tasks/multiple_choice.md +++ b/docs/source/en/tasks/multiple_choice.md @@ -265,7 +265,7 @@ At this point, only three steps remain: ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_swag_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... load_best_model_at_end=True, ... learning_rate=5e-5, diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md index 2c4706ad93b001..724e51d0dc9f5d 100644 --- a/docs/source/en/tasks/question_answering.md +++ b/docs/source/en/tasks/question_answering.md @@ -218,7 +218,7 @@ At this point, only three steps remain: ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_qa_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md index 675f9222cafd21..048a1d38d003b6 100644 --- a/docs/source/en/tasks/semantic_segmentation.md +++ b/docs/source/en/tasks/semantic_segmentation.md @@ -535,7 +535,7 @@ At this point, only three steps remain: ... per_device_train_batch_size=2, ... per_device_eval_batch_size=2, ... save_total_limit=3, -... evaluation_strategy="steps", +... eval_strategy="steps", ... save_strategy="steps", ... save_steps=20, ... eval_steps=20, diff --git a/docs/source/en/tasks/sequence_classification.md b/docs/source/en/tasks/sequence_classification.md index 572bc6a8b82418..67a792b43edac7 100644 --- a/docs/source/en/tasks/sequence_classification.md +++ b/docs/source/en/tasks/sequence_classification.md @@ -187,7 +187,7 @@ At this point, only three steps remain: ... per_device_eval_batch_size=16, ... num_train_epochs=2, ... weight_decay=0.01, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... load_best_model_at_end=True, ... push_to_hub=True, diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md index 28dd3f5a49ebe3..37a305a4ac008e 100644 --- a/docs/source/en/tasks/summarization.md +++ b/docs/source/en/tasks/summarization.md @@ -202,7 +202,7 @@ At this point, only three steps remain: ```py >>> training_args = Seq2SeqTrainingArguments( ... output_dir="my_awesome_billsum_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/en/tasks/text-to-speech.md b/docs/source/en/tasks/text-to-speech.md index 0b324904e9e263..494e20009529ce 100644 --- a/docs/source/en/tasks/text-to-speech.md +++ b/docs/source/en/tasks/text-to-speech.md @@ -477,7 +477,7 @@ only look at the loss: ... max_steps=4000, ... gradient_checkpointing=True, ... fp16=True, -... evaluation_strategy="steps", +... eval_strategy="steps", ... per_device_eval_batch_size=2, ... save_steps=1000, ... eval_steps=1000, diff --git a/docs/source/en/tasks/token_classification.md b/docs/source/en/tasks/token_classification.md index 791737b677c871..d0e4e87963f9b1 100644 --- a/docs/source/en/tasks/token_classification.md +++ b/docs/source/en/tasks/token_classification.md @@ -290,7 +290,7 @@ At this point, only three steps remain: ... per_device_eval_batch_size=16, ... num_train_epochs=2, ... 
weight_decay=0.01, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... load_best_model_at_end=True, ... push_to_hub=True, diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md index f0433a0dad797d..172b06e546f8a7 100644 --- a/docs/source/en/tasks/translation.md +++ b/docs/source/en/tasks/translation.md @@ -209,7 +209,7 @@ At this point, only three steps remain: ```py >>> training_args = Seq2SeqTrainingArguments( ... output_dir="my_awesome_opus_books_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md index 38bdceba41b7b4..1a0b8deeb1d34a 100644 --- a/docs/source/en/tasks/video_classification.md +++ b/docs/source/en/tasks/video_classification.md @@ -354,7 +354,7 @@ Most of the training arguments are self-explanatory, but one that is quite impor >>> args = TrainingArguments( ... new_model_name, ... remove_unused_columns=False, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... learning_rate=5e-5, ... per_device_train_batch_size=batch_size, diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 3d57220fe827d9..b69bebd6ea2004 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -62,7 +62,7 @@ training_args = TrainingArguments( per_device_eval_batch_size=16, num_train_epochs=2, weight_decay=0.01, - evaluation_strategy="epoch", + eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, push_to_hub=True, diff --git a/docs/source/en/training.md b/docs/source/en/training.md index 4bd72aa9f6384d..cea583c05ebc7d 100644 --- a/docs/source/en/training.md +++ b/docs/source/en/training.md @@ -128,12 +128,12 @@ Call [`~evaluate.compute`] on `metric` to calculate the accuracy of your predict ... return metric.compute(predictions=predictions, references=labels) ``` -If you'd like to monitor your evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch: +If you'd like to monitor your evaluation metrics during fine-tuning, specify the `eval_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch: ```py >>> from transformers import TrainingArguments, Trainer ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +>>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") ``` ### Trainer diff --git a/docs/source/es/tasks/asr.md b/docs/source/es/tasks/asr.md index 850bdfd711e7e0..7d3133af472f64 100644 --- a/docs/source/es/tasks/asr.md +++ b/docs/source/es/tasks/asr.md @@ -260,7 +260,7 @@ En este punto, solo quedan tres pasos: ... gradient_checkpointing=True, ... fp16=True, ... group_by_length=True, -... evaluation_strategy="steps", +... eval_strategy="steps", ... per_device_eval_batch_size=8, ... save_steps=1000, ... 
eval_steps=1000, diff --git a/docs/source/es/tasks/image_captioning.md b/docs/source/es/tasks/image_captioning.md index f06f6eda0a7576..620dcec1bfbd1c 100644 --- a/docs/source/es/tasks/image_captioning.md +++ b/docs/source/es/tasks/image_captioning.md @@ -188,7 +188,7 @@ training_args = TrainingArguments( per_device_eval_batch_size=32, gradient_accumulation_steps=2, save_total_limit=3, - evaluation_strategy="steps", + eval_strategy="steps", eval_steps=50, save_strategy="steps", save_steps=50, diff --git a/docs/source/es/tasks/image_classification.md b/docs/source/es/tasks/image_classification.md index f09730caf69fee..4e3696c505b030 100644 --- a/docs/source/es/tasks/image_classification.md +++ b/docs/source/es/tasks/image_classification.md @@ -143,7 +143,7 @@ Al llegar a este punto, solo quedan tres pasos: >>> training_args = TrainingArguments( ... output_dir="./results", ... per_device_train_batch_size=16, -... evaluation_strategy="steps", +... eval_strategy="steps", ... num_train_epochs=4, ... fp16=True, ... save_steps=100, diff --git a/docs/source/es/tasks/language_modeling.md b/docs/source/es/tasks/language_modeling.md index 010d1bccae7bbf..73bfc4d650f131 100644 --- a/docs/source/es/tasks/language_modeling.md +++ b/docs/source/es/tasks/language_modeling.md @@ -232,7 +232,7 @@ A este punto, solo faltan tres pasos: ```py >>> training_args = TrainingArguments( ... output_dir="./results", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... weight_decay=0.01, ... ) @@ -338,7 +338,7 @@ A este punto, solo faltan tres pasos: ```py >>> training_args = TrainingArguments( ... output_dir="./results", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... num_train_epochs=3, ... weight_decay=0.01, diff --git a/docs/source/es/tasks/multiple_choice.md b/docs/source/es/tasks/multiple_choice.md index ca2e3d15f63546..959416f149c357 100644 --- a/docs/source/es/tasks/multiple_choice.md +++ b/docs/source/es/tasks/multiple_choice.md @@ -212,7 +212,7 @@ En este punto, solo quedan tres pasos: ```py >>> training_args = TrainingArguments( ... output_dir="./results", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=5e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/es/tasks/question_answering.md b/docs/source/es/tasks/question_answering.md index 5cd59f6b064f71..ca43aac9ae9e7a 100644 --- a/docs/source/es/tasks/question_answering.md +++ b/docs/source/es/tasks/question_answering.md @@ -182,7 +182,7 @@ En este punto, solo quedan tres pasos: ```py >>> training_args = TrainingArguments( ... output_dir="./results", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/es/tasks/summarization.md b/docs/source/es/tasks/summarization.md index 19ceb90b22cbb2..e6a9532f660387 100644 --- a/docs/source/es/tasks/summarization.md +++ b/docs/source/es/tasks/summarization.md @@ -140,7 +140,7 @@ En este punto, solo faltan tres pasos: ```py >>> training_args = Seq2SeqTrainingArguments( ... output_dir="./results", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... 
per_device_eval_batch_size=16, diff --git a/docs/source/es/trainer.md b/docs/source/es/trainer.md index 9a36e3867c17e3..57fcaa62900572 100644 --- a/docs/source/es/trainer.md +++ b/docs/source/es/trainer.md @@ -60,7 +60,7 @@ training_args = TrainingArguments( per_device_eval_batch_size=16, num_train_epochs=2, weight_decay=0.01, - evaluation_strategy="epoch", + eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, push_to_hub=True, diff --git a/docs/source/es/training.md b/docs/source/es/training.md index fef44ed3f9ff72..f10f49d08997ac 100644 --- a/docs/source/es/training.md +++ b/docs/source/es/training.md @@ -120,12 +120,12 @@ Define la función `compute` en `metric` para calcular el accuracy de tus predic ... return metric.compute(predictions=predictions, references=labels) ``` -Si quieres controlar tus métricas de evaluación durante el fine-tuning, especifica el parámetro `evaluation_strategy` en tus argumentos de entrenamiento para que el modelo tenga en cuenta la métrica de evaluación al final de cada época: +Si quieres controlar tus métricas de evaluación durante el fine-tuning, especifica el parámetro `eval_strategy` en tus argumentos de entrenamiento para que el modelo tenga en cuenta la métrica de evaluación al final de cada época: ```py >>> from transformers import TrainingArguments ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +>>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") ``` ### Trainer diff --git a/docs/source/it/migration.md b/docs/source/it/migration.md index 9a5f4d005505e8..07d31705784e7f 100644 --- a/docs/source/it/migration.md +++ b/docs/source/it/migration.md @@ -167,7 +167,7 @@ Per quanto riguarda la classe `Trainer`: - Il metodo `is_world_master` di `Trainer` è deprecato a favore di `is_world_process_zero`. Per quanto riguarda la classe `TrainingArguments`: -- L'argomento `evaluate_during_training` di `TrainingArguments` è deprecato a favore di `evaluation_strategy`. +- L'argomento `evaluate_during_training` di `TrainingArguments` è deprecato a favore di `eval_strategy`. Per quanto riguarda il modello Transfo-XL: - L'attributo di configurazione `tie_weight` di Transfo-XL diventa `tie_words_embeddings`. diff --git a/docs/source/it/training.md b/docs/source/it/training.md index 2a64cfca375f69..21008a92bf7c6f 100644 --- a/docs/source/it/training.md +++ b/docs/source/it/training.md @@ -121,12 +121,12 @@ Richiama `compute` su `metric` per calcolare l'accuratezza delle tue previsioni. ... 
return metric.compute(predictions=predictions, references=labels) ``` -Se preferisci monitorare le tue metriche di valutazione durante il fine-tuning, specifica il parametro `evaluation_strategy` nei tuoi training arguments per restituire le metriche di valutazione ad ogni epoca di addestramento: +Se preferisci monitorare le tue metriche di valutazione durante il fine-tuning, specifica il parametro `eval_strategy` nei tuoi training arguments per restituire le metriche di valutazione ad ogni epoca di addestramento: ```py >>> from transformers import TrainingArguments, Trainer ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +>>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") ``` ### Trainer diff --git a/docs/source/ja/model_memory_anatomy.md b/docs/source/ja/model_memory_anatomy.md index 5f09489b7f79aa..45a383d616ad34 100644 --- a/docs/source/ja/model_memory_anatomy.md +++ b/docs/source/ja/model_memory_anatomy.md @@ -136,7 +136,7 @@ Tue Jan 11 08:58:05 2022 ```py default_args = { "output_dir": "tmp", - "evaluation_strategy": "steps", + "eval_strategy": "steps", "num_train_epochs": 1, "log_level": "error", "report_to": "none", diff --git a/docs/source/ja/tasks/asr.md b/docs/source/ja/tasks/asr.md index fd564abdc5c908..6d5f65461d215b 100644 --- a/docs/source/ja/tasks/asr.md +++ b/docs/source/ja/tasks/asr.md @@ -270,7 +270,7 @@ MInDS-14 データセットのサンプリング レートは 8000kHz です ( ... gradient_checkpointing=True, ... fp16=True, ... group_by_length=True, -... evaluation_strategy="steps", +... eval_strategy="steps", ... per_device_eval_batch_size=8, ... save_steps=1000, ... eval_steps=1000, diff --git a/docs/source/ja/tasks/audio_classification.md b/docs/source/ja/tasks/audio_classification.md index 58d42f3f4d4ff1..6f4d0dd171846a 100644 --- a/docs/source/ja/tasks/audio_classification.md +++ b/docs/source/ja/tasks/audio_classification.md @@ -221,7 +221,7 @@ MInDS-14 データセットのサンプリング レートは 8000khz です ( ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_mind_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... learning_rate=3e-5, ... per_device_train_batch_size=32, diff --git a/docs/source/ja/tasks/document_question_answering.md b/docs/source/ja/tasks/document_question_answering.md index 478c6af2235490..ec88f262086cf5 100644 --- a/docs/source/ja/tasks/document_question_answering.md +++ b/docs/source/ja/tasks/document_question_answering.md @@ -403,7 +403,7 @@ end_index 18 ... num_train_epochs=20, ... save_steps=200, ... logging_steps=50, -... evaluation_strategy="steps", +... eval_strategy="steps", ... learning_rate=5e-5, ... save_total_limit=2, ... 
remove_unused_columns=False, diff --git a/docs/source/ja/tasks/image_captioning.md b/docs/source/ja/tasks/image_captioning.md index 31c687c111c071..7649947b2c6450 100644 --- a/docs/source/ja/tasks/image_captioning.md +++ b/docs/source/ja/tasks/image_captioning.md @@ -194,7 +194,7 @@ training_args = TrainingArguments( per_device_eval_batch_size=32, gradient_accumulation_steps=2, save_total_limit=3, - evaluation_strategy="steps", + eval_strategy="steps", eval_steps=50, save_strategy="steps", save_steps=50, diff --git a/docs/source/ja/tasks/image_classification.md b/docs/source/ja/tasks/image_classification.md index f8d8d0d55238b9..f16e46c26fc316 100644 --- a/docs/source/ja/tasks/image_classification.md +++ b/docs/source/ja/tasks/image_classification.md @@ -308,7 +308,7 @@ food["test"].set_transform(preprocess_val) >>> training_args = TrainingArguments( ... output_dir="my_awesome_food_model", ... remove_unused_columns=False, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... learning_rate=5e-5, ... per_device_train_batch_size=16, diff --git a/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md b/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md index 16df6e3b9d9658..30c0dbbf063040 100644 --- a/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md +++ b/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md @@ -112,7 +112,7 @@ training_args = TrainingArguments( fp16=True, logging_dir=f"{repo_name}/logs", logging_strategy="epoch", - evaluation_strategy="epoch", + eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model="accuracy", diff --git a/docs/source/ja/tasks/language_modeling.md b/docs/source/ja/tasks/language_modeling.md index 835a0d54ea4ffd..1d1bcab0b3757a 100644 --- a/docs/source/ja/tasks/language_modeling.md +++ b/docs/source/ja/tasks/language_modeling.md @@ -246,7 +246,7 @@ Apply the `group_texts` function over the entire dataset: ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_eli5_clm-model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... weight_decay=0.01, ... push_to_hub=True, diff --git a/docs/source/ja/tasks/masked_language_modeling.md b/docs/source/ja/tasks/masked_language_modeling.md index b0fff72f9b0e26..29488d5c71e44e 100644 --- a/docs/source/ja/tasks/masked_language_modeling.md +++ b/docs/source/ja/tasks/masked_language_modeling.md @@ -231,7 +231,7 @@ pip install transformers datasets evaluate ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_eli5_mlm_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... num_train_epochs=3, ... weight_decay=0.01, diff --git a/docs/source/ja/tasks/multiple_choice.md b/docs/source/ja/tasks/multiple_choice.md index bfe5f388cb4ab6..045c9112932dba 100644 --- a/docs/source/ja/tasks/multiple_choice.md +++ b/docs/source/ja/tasks/multiple_choice.md @@ -266,7 +266,7 @@ tokenized_swag = swag.map(preprocess_function, batched=True) ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_swag_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... load_best_model_at_end=True, ... 
learning_rate=5e-5, diff --git a/docs/source/ja/tasks/question_answering.md b/docs/source/ja/tasks/question_answering.md index 54df687c2f047f..d7feac56076ffa 100644 --- a/docs/source/ja/tasks/question_answering.md +++ b/docs/source/ja/tasks/question_answering.md @@ -220,7 +220,7 @@ pip install transformers datasets evaluate ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_qa_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/ja/tasks/semantic_segmentation.md b/docs/source/ja/tasks/semantic_segmentation.md index 2816688b4e1c14..572280c1962ede 100644 --- a/docs/source/ja/tasks/semantic_segmentation.md +++ b/docs/source/ja/tasks/semantic_segmentation.md @@ -323,7 +323,7 @@ pip install -q datasets transformers evaluate ... per_device_train_batch_size=2, ... per_device_eval_batch_size=2, ... save_total_limit=3, -... evaluation_strategy="steps", +... eval_strategy="steps", ... save_strategy="steps", ... save_steps=20, ... eval_steps=20, diff --git a/docs/source/ja/tasks/sequence_classification.md b/docs/source/ja/tasks/sequence_classification.md index 767d5e03cdf607..c97644ca10fad6 100644 --- a/docs/source/ja/tasks/sequence_classification.md +++ b/docs/source/ja/tasks/sequence_classification.md @@ -324,7 +324,7 @@ pip install -q datasets transformers evaluate ... per_device_train_batch_size=2, ... per_device_eval_batch_size=2, ... save_total_limit=3, -... evaluation_strategy="steps", +... eval_strategy="steps", ... save_strategy="steps", ... save_steps=20, ... eval_steps=20, diff --git a/docs/source/ja/tasks/summarization.md b/docs/source/ja/tasks/summarization.md index a4b012d712f2e7..04f1a53d13f2c6 100644 --- a/docs/source/ja/tasks/summarization.md +++ b/docs/source/ja/tasks/summarization.md @@ -204,7 +204,7 @@ pip install transformers datasets evaluate rouge_score ```py >>> training_args = Seq2SeqTrainingArguments( ... output_dir="my_awesome_billsum_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/ja/tasks/text-to-speech.md b/docs/source/ja/tasks/text-to-speech.md index 357ec18855149e..b302a19a0d5818 100644 --- a/docs/source/ja/tasks/text-to-speech.md +++ b/docs/source/ja/tasks/text-to-speech.md @@ -477,7 +477,7 @@ SpeechT5 では、モデルのデコーダ部分への入力が 2 分の 1 に ... max_steps=4000, ... gradient_checkpointing=True, ... fp16=True, -... evaluation_strategy="steps", +... eval_strategy="steps", ... per_device_eval_batch_size=2, ... save_steps=1000, ... eval_steps=1000, diff --git a/docs/source/ja/tasks/token_classification.md b/docs/source/ja/tasks/token_classification.md index 2b650c4a844d84..497584674252ad 100644 --- a/docs/source/ja/tasks/token_classification.md +++ b/docs/source/ja/tasks/token_classification.md @@ -288,7 +288,7 @@ pip install transformers datasets evaluate seqeval ... per_device_eval_batch_size=16, ... num_train_epochs=2, ... weight_decay=0.01, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... load_best_model_at_end=True, ... 
push_to_hub=True, diff --git a/docs/source/ja/tasks/translation.md b/docs/source/ja/tasks/translation.md index fb2c89f3856d49..ed5bd24777e14d 100644 --- a/docs/source/ja/tasks/translation.md +++ b/docs/source/ja/tasks/translation.md @@ -208,7 +208,7 @@ pip install transformers datasets evaluate sacrebleu ```py >>> training_args = Seq2SeqTrainingArguments( ... output_dir="my_awesome_opus_books_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/ja/tasks/video_classification.md b/docs/source/ja/tasks/video_classification.md index e0c383619411bf..688cb701496f79 100644 --- a/docs/source/ja/tasks/video_classification.md +++ b/docs/source/ja/tasks/video_classification.md @@ -360,7 +360,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it >>> args = TrainingArguments( ... new_model_name, ... remove_unused_columns=False, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... learning_rate=5e-5, ... per_device_train_batch_size=batch_size, diff --git a/docs/source/ja/training.md b/docs/source/ja/training.md index 79fbb1b7fb2571..9dd2369601c10a 100644 --- a/docs/source/ja/training.md +++ b/docs/source/ja/training.md @@ -135,12 +135,12 @@ BERTモデルの事前学習済みのヘッドは破棄され、ランダムに ... return metric.compute(predictions=predictions, references=labels) ``` -評価メトリクスをファインチューニング中に監視したい場合、トレーニング引数で `evaluation_strategy` パラメータを指定して、各エポックの終了時に評価メトリクスを報告します: +評価メトリクスをファインチューニング中に監視したい場合、トレーニング引数で `eval_strategy` パラメータを指定して、各エポックの終了時に評価メトリクスを報告します: ```python >>> from transformers import TrainingArguments, Trainer ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +>>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") ``` ### Trainer diff --git a/docs/source/ko/model_memory_anatomy.md b/docs/source/ko/model_memory_anatomy.md index 5701e19aaa085d..a5c3a0f35292af 100644 --- a/docs/source/ko/model_memory_anatomy.md +++ b/docs/source/ko/model_memory_anatomy.md @@ -132,7 +132,7 @@ Tue Jan 11 08:58:05 2022 ```py default_args = { "output_dir": "tmp", - "evaluation_strategy": "steps", + "eval_strategy": "steps", "num_train_epochs": 1, "log_level": "error", "report_to": "none", diff --git a/docs/source/ko/tasks/asr.md b/docs/source/ko/tasks/asr.md index 47a568ecf02bb4..474d60bf2d1a19 100644 --- a/docs/source/ko/tasks/asr.md +++ b/docs/source/ko/tasks/asr.md @@ -274,7 +274,7 @@ MInDS-14 데이터 세트의 샘플링 레이트는 8000kHz이므로([데이터 ... gradient_checkpointing=True, ... fp16=True, ... group_by_length=True, -... evaluation_strategy="steps", +... eval_strategy="steps", ... per_device_eval_batch_size=8, ... save_steps=1000, ... eval_steps=1000, diff --git a/docs/source/ko/tasks/audio_classification.md b/docs/source/ko/tasks/audio_classification.md index 7e1094815fd429..c9ef810e8ef4f4 100644 --- a/docs/source/ko/tasks/audio_classification.md +++ b/docs/source/ko/tasks/audio_classification.md @@ -221,7 +221,7 @@ MinDS-14 데이터 세트의 샘플링 속도는 8000khz이므로(이 정보는 ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_mind_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... learning_rate=3e-5, ... 
per_device_train_batch_size=32, diff --git a/docs/source/ko/tasks/document_question_answering.md b/docs/source/ko/tasks/document_question_answering.md index b9e98f3bf67235..920eb99ea52960 100644 --- a/docs/source/ko/tasks/document_question_answering.md +++ b/docs/source/ko/tasks/document_question_answering.md @@ -385,7 +385,7 @@ end_index 18 ... num_train_epochs=20, ... save_steps=200, ... logging_steps=50, -... evaluation_strategy="steps", +... eval_strategy="steps", ... learning_rate=5e-5, ... save_total_limit=2, ... remove_unused_columns=False, diff --git a/docs/source/ko/tasks/image_captioning.md b/docs/source/ko/tasks/image_captioning.md index c5139649a9185b..c4d0f99b6170ee 100644 --- a/docs/source/ko/tasks/image_captioning.md +++ b/docs/source/ko/tasks/image_captioning.md @@ -201,7 +201,7 @@ training_args = TrainingArguments( per_device_eval_batch_size=32, gradient_accumulation_steps=2, save_total_limit=3, - evaluation_strategy="steps", + eval_strategy="steps", eval_steps=50, save_strategy="steps", save_steps=50, diff --git a/docs/source/ko/tasks/image_classification.md b/docs/source/ko/tasks/image_classification.md index 031e01ea5c5a83..d647b4512b038a 100644 --- a/docs/source/ko/tasks/image_classification.md +++ b/docs/source/ko/tasks/image_classification.md @@ -301,7 +301,7 @@ food["test"].set_transform(preprocess_val) >>> training_args = TrainingArguments( ... output_dir="my_awesome_food_model", ... remove_unused_columns=False, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... learning_rate=5e-5, ... per_device_train_batch_size=16, diff --git a/docs/source/ko/tasks/language_modeling.md b/docs/source/ko/tasks/language_modeling.md index ee1d11c1d09daf..b98c64dcc3adae 100644 --- a/docs/source/ko/tasks/language_modeling.md +++ b/docs/source/ko/tasks/language_modeling.md @@ -233,7 +233,7 @@ pip install transformers datasets evaluate ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_eli5_clm-model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... weight_decay=0.01, ... push_to_hub=True, diff --git a/docs/source/ko/tasks/masked_language_modeling.md b/docs/source/ko/tasks/masked_language_modeling.md index 3aafdf1cb9eebe..c710dbf168ed01 100644 --- a/docs/source/ko/tasks/masked_language_modeling.md +++ b/docs/source/ko/tasks/masked_language_modeling.md @@ -236,7 +236,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티와 ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_eli5_mlm_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... num_train_epochs=3, ... weight_decay=0.01, diff --git a/docs/source/ko/tasks/multiple_choice.md b/docs/source/ko/tasks/multiple_choice.md index 4e02f7fabe504f..b28654ea4f1438 100644 --- a/docs/source/ko/tasks/multiple_choice.md +++ b/docs/source/ko/tasks/multiple_choice.md @@ -265,7 +265,7 @@ tokenized_swag = swag.map(preprocess_function, batched=True) ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_swag_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... load_best_model_at_end=True, ... 
learning_rate=5e-5, diff --git a/docs/source/ko/tasks/question_answering.md b/docs/source/ko/tasks/question_answering.md index 9539b9a403030e..7fe8ba3a5f08d0 100644 --- a/docs/source/ko/tasks/question_answering.md +++ b/docs/source/ko/tasks/question_answering.md @@ -215,7 +215,7 @@ pip install transformers datasets evaluate ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_qa_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/ko/tasks/semantic_segmentation.md b/docs/source/ko/tasks/semantic_segmentation.md index 4b6109d692bf10..0afa4bbe020f7c 100644 --- a/docs/source/ko/tasks/semantic_segmentation.md +++ b/docs/source/ko/tasks/semantic_segmentation.md @@ -317,7 +317,7 @@ pip install -q datasets transformers evaluate ... per_device_train_batch_size=2, ... per_device_eval_batch_size=2, ... save_total_limit=3, -... evaluation_strategy="steps", +... eval_strategy="steps", ... save_strategy="steps", ... save_steps=20, ... eval_steps=20, diff --git a/docs/source/ko/tasks/sequence_classification.md b/docs/source/ko/tasks/sequence_classification.md index a1a5da50e9f614..9cf6b9f52433a3 100644 --- a/docs/source/ko/tasks/sequence_classification.md +++ b/docs/source/ko/tasks/sequence_classification.md @@ -185,7 +185,7 @@ tokenized_imdb = imdb.map(preprocess_function, batched=True) ... per_device_eval_batch_size=16, ... num_train_epochs=2, ... weight_decay=0.01, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... load_best_model_at_end=True, ... push_to_hub=True, diff --git a/docs/source/ko/tasks/summarization.md b/docs/source/ko/tasks/summarization.md index 43eae25d79f0aa..62e410757e464e 100644 --- a/docs/source/ko/tasks/summarization.md +++ b/docs/source/ko/tasks/summarization.md @@ -211,7 +211,7 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 ```py >>> training_args = Seq2SeqTrainingArguments( ... output_dir="my_awesome_billsum_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/ko/tasks/token_classification.md b/docs/source/ko/tasks/token_classification.md index 1e49d79a0d7235..5bb3989d45944f 100644 --- a/docs/source/ko/tasks/token_classification.md +++ b/docs/source/ko/tasks/token_classification.md @@ -288,7 +288,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에 ... per_device_eval_batch_size=16, ... num_train_epochs=2, ... weight_decay=0.01, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... load_best_model_at_end=True, ... push_to_hub=True, diff --git a/docs/source/ko/tasks/translation.md b/docs/source/ko/tasks/translation.md index 6de275f7d04c80..60ca77ee1e41bb 100644 --- a/docs/source/ko/tasks/translation.md +++ b/docs/source/ko/tasks/translation.md @@ -209,7 +209,7 @@ pip install transformers datasets evaluate sacrebleu ```py >>> training_args = Seq2SeqTrainingArguments( ... output_dir="my_awesome_opus_books_model", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... 
per_device_eval_batch_size=16, diff --git a/docs/source/ko/tasks/video_classification.md b/docs/source/ko/tasks/video_classification.md index 01dbb0757b6608..762716c9ff7f8e 100644 --- a/docs/source/ko/tasks/video_classification.md +++ b/docs/source/ko/tasks/video_classification.md @@ -358,7 +358,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it >>> args = TrainingArguments( ... new_model_name, ... remove_unused_columns=False, -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... save_strategy="epoch", ... learning_rate=5e-5, ... per_device_train_batch_size=batch_size, diff --git a/docs/source/ko/training.md b/docs/source/ko/training.md index fa6d56bdc36696..432ba186c3df0c 100644 --- a/docs/source/ko/training.md +++ b/docs/source/ko/training.md @@ -129,12 +129,12 @@ rendered properly in your Markdown viewer. ... return metric.compute(predictions=predictions, references=labels) ``` -미세 튜닝 중에 평가 지표를 모니터링하려면 훈련 인수에 `evaluation_strategy` 파라미터를 지정하여 각 에폭이 끝날 때 평가 지표를 확인할 수 있습니다: +미세 튜닝 중에 평가 지표를 모니터링하려면 훈련 인수에 `eval_strategy` 파라미터를 지정하여 각 에폭이 끝날 때 평가 지표를 확인할 수 있습니다: ```py >>> from transformers import TrainingArguments, Trainer ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +>>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") ``` ### 훈련 하기[[trainer]] diff --git a/docs/source/pt/tasks/token_classification.md b/docs/source/pt/tasks/token_classification.md index 3465680dcc2046..d4d6bf4dd906ee 100644 --- a/docs/source/pt/tasks/token_classification.md +++ b/docs/source/pt/tasks/token_classification.md @@ -180,7 +180,7 @@ Nesse ponto, restam apenas três passos: ```py >>> training_args = TrainingArguments( ... output_dir="./results", -... evaluation_strategy="epoch", +... eval_strategy="epoch", ... learning_rate=2e-5, ... per_device_train_batch_size=16, ... per_device_eval_batch_size=16, diff --git a/docs/source/pt/training.md b/docs/source/pt/training.md index 49f57dead24233..67294baee35c1f 100644 --- a/docs/source/pt/training.md +++ b/docs/source/pt/training.md @@ -146,13 +146,13 @@ todos os modelos de 🤗 Transformers retornam logits). ... return metric.compute(predictions=predictions, references=labels) ``` -Se quiser controlar as suas métricas de avaliação durante o fine-tuning, especifique o parâmetro `evaluation_strategy` +Se quiser controlar as suas métricas de avaliação durante o fine-tuning, especifique o parâmetro `eval_strategy` nos seus argumentos de treinamento para que o modelo considere a métrica de avaliação ao final de cada época: ```py >>> from transformers import TrainingArguments ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +>>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") ``` ### Trainer diff --git a/docs/source/zh/tasks/asr.md b/docs/source/zh/tasks/asr.md index 91fee0ab332ede..48ab94cb7d9503 100644 --- a/docs/source/zh/tasks/asr.md +++ b/docs/source/zh/tasks/asr.md @@ -288,7 +288,7 @@ Wav2Vec2 分词器仅训练了大写字符,因此您需要确保文本与分 ... gradient_checkpointing=True, ... fp16=True, ... group_by_length=True, -... evaluation_strategy="steps", +... eval_strategy="steps", ... per_device_eval_batch_size=8, ... save_steps=1000, ... eval_steps=1000, diff --git a/docs/source/zh/training.md b/docs/source/zh/training.md index 773c58181c31e9..aeacf732c22f42 100644 --- a/docs/source/zh/training.md +++ b/docs/source/zh/training.md @@ -125,12 +125,12 @@ rendered properly in your Markdown viewer. 
... return metric.compute(predictions=predictions, references=labels) ``` -如果您希望在微调过程中监视评估指标,请在您的训练参数中指定 `evaluation_strategy` 参数,以在每个`epoch`结束时展示评估指标: +如果您希望在微调过程中监视评估指标,请在您的训练参数中指定 `eval_strategy` 参数,以在每个`epoch`结束时展示评估指标: ```py >>> from transformers import TrainingArguments, Trainer ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +>>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") ``` ### 训练器 diff --git a/examples/flax/language-modeling/README.md b/examples/flax/language-modeling/README.md index cb8671147ff98c..324c560ea4a7f3 100644 --- a/examples/flax/language-modeling/README.md +++ b/examples/flax/language-modeling/README.md @@ -490,7 +490,7 @@ python3 xla_spawn.py --num_cores ${NUM_TPUS} run_mlm.py --output_dir="./runs" \ --do_train \ --do_eval \ --logging_steps="500" \ - --evaluation_strategy="epoch" \ + --eval_strategy="epoch" \ --report_to="tensorboard" \ --save_strategy="no" ``` @@ -538,7 +538,7 @@ python3 -m torch.distributed.launch --nproc_per_node ${NUM_GPUS} run_mlm.py \ --do_train \ --do_eval \ --logging_steps="500" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --report_to="tensorboard" \ --save_strategy="no" ``` diff --git a/examples/legacy/seq2seq/finetune.sh b/examples/legacy/seq2seq/finetune.sh index 1f518835d63859..60023df7bad6ae 100644 --- a/examples/legacy/seq2seq/finetune.sh +++ b/examples/legacy/seq2seq/finetune.sh @@ -18,7 +18,7 @@ python finetune_trainer.py \ --learning_rate=3e-5 \ --fp16 \ --do_train --do_eval --do_predict \ - --evaluation_strategy steps \ + --eval_strategy steps \ --predict_with_generate \ --n_val 1000 \ "$@" diff --git a/examples/legacy/seq2seq/finetune_tpu.sh b/examples/legacy/seq2seq/finetune_tpu.sh index 68cf0d77360292..ef72b0953b440b 100644 --- a/examples/legacy/seq2seq/finetune_tpu.sh +++ b/examples/legacy/seq2seq/finetune_tpu.sh @@ -20,7 +20,7 @@ python xla_spawn.py --num_cores $TPU_NUM_CORES \ finetune_trainer.py \ --learning_rate=3e-5 \ --do_train --do_eval \ - --evaluation_strategy steps \ + --eval_strategy steps \ --prediction_loss_only \ --n_val 1000 \ "$@" diff --git a/examples/legacy/seq2seq/finetune_trainer.py b/examples/legacy/seq2seq/finetune_trainer.py index 4e186c96d8c218..e269bc2474eca5 100755 --- a/examples/legacy/seq2seq/finetune_trainer.py +++ b/examples/legacy/seq2seq/finetune_trainer.py @@ -271,7 +271,7 @@ def main(): max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) - if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO + if training_args.do_eval or training_args.eval_strategy != EvaluationStrategy.NO else None ) test_dataset = ( diff --git a/examples/legacy/seq2seq/train_distil_marian_enro.sh b/examples/legacy/seq2seq/train_distil_marian_enro.sh index fc1b90595c5e69..5e86a6991c579e 100644 --- a/examples/legacy/seq2seq/train_distil_marian_enro.sh +++ b/examples/legacy/seq2seq/train_distil_marian_enro.sh @@ -32,7 +32,7 @@ python finetune_trainer.py \ --max_source_length $MAX_LEN --max_target_length $MAX_LEN \ --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \ --do_train --do_eval --do_predict \ - --evaluation_strategy steps \ + --eval_strategy steps \ --predict_with_generate --logging_first_step \ --task translation --label_smoothing_factor 0.1 \ "$@" diff --git a/examples/legacy/seq2seq/train_distil_marian_enro_tpu.sh b/examples/legacy/seq2seq/train_distil_marian_enro_tpu.sh index 2fce7684ab449d..00ef672261963b 100644 --- 
a/examples/legacy/seq2seq/train_distil_marian_enro_tpu.sh +++ b/examples/legacy/seq2seq/train_distil_marian_enro_tpu.sh @@ -33,7 +33,7 @@ python xla_spawn.py --num_cores $TPU_NUM_CORES \ --max_source_length $MAX_LEN --max_target_length $MAX_LEN \ --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \ --do_train --do_eval \ - --evaluation_strategy steps \ + --eval_strategy steps \ --prediction_loss_only \ --task translation --label_smoothing_factor 0.1 \ "$@" diff --git a/examples/legacy/seq2seq/train_distilbart_cnn.sh b/examples/legacy/seq2seq/train_distilbart_cnn.sh index ec0aec8e597fb4..42f34e0cb6e75a 100644 --- a/examples/legacy/seq2seq/train_distilbart_cnn.sh +++ b/examples/legacy/seq2seq/train_distilbart_cnn.sh @@ -34,6 +34,6 @@ python finetune_trainer.py \ --logging_first_step \ --max_target_length 56 --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN\ --do_train --do_eval --do_predict \ - --evaluation_strategy steps \ + --eval_strategy steps \ --predict_with_generate --sortish_sampler \ "$@" diff --git a/examples/legacy/seq2seq/train_mbart_cc25_enro.sh b/examples/legacy/seq2seq/train_mbart_cc25_enro.sh index 2b603eda7c35e6..63c8051b47def1 100644 --- a/examples/legacy/seq2seq/train_mbart_cc25_enro.sh +++ b/examples/legacy/seq2seq/train_mbart_cc25_enro.sh @@ -29,7 +29,7 @@ python finetune_trainer.py \ --num_train_epochs 6 \ --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \ --do_train --do_eval --do_predict \ - --evaluation_strategy steps \ + --eval_strategy steps \ --predict_with_generate --logging_first_step \ --task translation \ "$@" diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md index 63a56a06e8d5a4..c2f89f2477e691 100644 --- a/examples/pytorch/README.md +++ b/examples/pytorch/README.md @@ -283,7 +283,7 @@ To enable Neptune logging, in your `TrainingArguments`, set the `report_to` argu ```python training_args = TrainingArguments( "quick-training-distilbert-mrpc", - evaluation_strategy="steps", + eval_strategy="steps", eval_steps=20, report_to="neptune", ) diff --git a/examples/pytorch/audio-classification/README.md b/examples/pytorch/audio-classification/README.md index cc669a0894e14d..bc4581089c3fd2 100644 --- a/examples/pytorch/audio-classification/README.md +++ b/examples/pytorch/audio-classification/README.md @@ -50,7 +50,7 @@ python run_audio_classification.py \ --dataloader_num_workers 4 \ --logging_strategy steps \ --logging_steps 10 \ - --evaluation_strategy epoch \ + --eval_strategy epoch \ --save_strategy epoch \ --load_best_model_at_end True \ --metric_for_best_model accuracy \ @@ -92,7 +92,7 @@ python run_audio_classification.py \ --dataloader_num_workers 8 \ --logging_strategy steps \ --logging_steps 10 \ - --evaluation_strategy epoch \ + --eval_strategy epoch \ --save_strategy epoch \ --load_best_model_at_end True \ --metric_for_best_model accuracy \ diff --git a/examples/pytorch/image-classification/README.md b/examples/pytorch/image-classification/README.md index 112cc51764a38e..62996ee19e375a 100644 --- a/examples/pytorch/image-classification/README.md +++ b/examples/pytorch/image-classification/README.md @@ -52,7 +52,7 @@ python run_image_classification.py \ --per_device_eval_batch_size 8 \ --logging_strategy steps \ --logging_steps 10 \ - --evaluation_strategy epoch \ + --eval_strategy epoch \ --save_strategy epoch \ --load_best_model_at_end True \ --save_total_limit 3 \ diff --git a/examples/pytorch/image-pretraining/README.md b/examples/pytorch/image-pretraining/README.md index 
65bb863f38b6ce..88c71e643e4c24 100644 --- a/examples/pytorch/image-pretraining/README.md +++ b/examples/pytorch/image-pretraining/README.md @@ -56,7 +56,7 @@ Alternatively, one can decide to further pre-train an already pre-trained (or fi --per_device_eval_batch_size 8 \ --logging_strategy steps \ --logging_steps 10 \ - --evaluation_strategy epoch \ + --eval_strategy epoch \ --save_strategy epoch \ --load_best_model_at_end True \ --save_total_limit 3 \ @@ -106,7 +106,7 @@ Next, we can run the script by providing the path to this custom configuration ( --per_device_eval_batch_size 8 \ --logging_strategy steps \ --logging_steps 10 \ - --evaluation_strategy epoch \ + --eval_strategy epoch \ --save_strategy epoch \ --load_best_model_at_end True \ --save_total_limit 3 \ @@ -172,7 +172,7 @@ python run_mae.py \ --per_device_eval_batch_size 8 \ --logging_strategy steps \ --logging_steps 10 \ - --evaluation_strategy epoch \ + --eval_strategy epoch \ --save_strategy epoch \ --load_best_model_at_end True \ --save_total_limit 3 \ diff --git a/examples/pytorch/semantic-segmentation/README.md b/examples/pytorch/semantic-segmentation/README.md index a0f830e16e915c..0be42d4fe84483 100644 --- a/examples/pytorch/semantic-segmentation/README.md +++ b/examples/pytorch/semantic-segmentation/README.md @@ -118,7 +118,7 @@ python run_semantic_segmentation.py \ --per_device_eval_batch_size 8 \ --logging_strategy steps \ --logging_steps 100 \ - --evaluation_strategy epoch \ + --eval_strategy epoch \ --save_strategy epoch \ --seed 1337 ``` diff --git a/examples/pytorch/speech-recognition/README.md b/examples/pytorch/speech-recognition/README.md index 8dbfcafe3405f9..b9cab9513bd446 100644 --- a/examples/pytorch/speech-recognition/README.md +++ b/examples/pytorch/speech-recognition/README.md @@ -76,7 +76,7 @@ python run_speech_recognition_ctc.py \ --gradient_accumulation_steps="2" \ --learning_rate="3e-4" \ --warmup_steps="500" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --text_column_name="sentence" \ --length_column_name="input_length" \ --save_steps="400" \ @@ -111,7 +111,7 @@ torchrun \ --per_device_train_batch_size="4" \ --learning_rate="3e-4" \ --warmup_steps="500" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --text_column_name="sentence" \ --length_column_name="input_length" \ --save_steps="400" \ @@ -162,7 +162,7 @@ However, the `--shuffle_buffer_size` argument controls how many examples we can --gradient_accumulation_steps="2" \ --learning_rate="5e-4" \ --warmup_steps="500" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --text_column_name="sentence" \ --save_steps="500" \ --eval_steps="500" \ @@ -293,7 +293,7 @@ python run_speech_recognition_ctc.py \ --per_device_train_batch_size="32" \ --learning_rate="1e-3" \ --warmup_steps="100" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --text_column_name="sentence" \ --length_column_name="input_length" \ --save_steps="200" \ @@ -330,7 +330,7 @@ python run_speech_recognition_ctc.py \ --per_device_train_batch_size="32" \ --learning_rate="1e-3" \ --warmup_steps="100" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --text_column_name="sentence" \ --length_column_name="input_length" \ --save_steps="200" \ @@ -378,7 +378,7 @@ python run_speech_recognition_seq2seq.py \ --logging_steps="25" \ --learning_rate="1e-5" \ --warmup_steps="500" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --eval_steps="1000" \ --save_strategy="steps" \ --save_steps="1000" \ @@ -419,7 +419,7 
@@ torchrun \ --logging_steps="25" \ --learning_rate="1e-5" \ --warmup_steps="500" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --eval_steps="1000" \ --save_strategy="steps" \ --save_steps="1000" \ @@ -547,7 +547,7 @@ python run_speech_recognition_seq2seq.py \ --gradient_accumulation_steps="8" \ --learning_rate="3e-4" \ --warmup_steps="400" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --text_column_name="text" \ --save_steps="400" \ --eval_steps="400" \ @@ -589,7 +589,7 @@ torchrun \ --gradient_accumulation_steps="1" \ --learning_rate="3e-4" \ --warmup_steps="400" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --text_column_name="text" \ --save_steps="400" \ --eval_steps="400" \ diff --git a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py index 927a15f9be679f..de06b988db634c 100644 --- a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py +++ b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py @@ -100,7 +100,7 @@ def tokenize(example): output_dir=args.output_dir, learning_rate=args.learning_rate, lr_scheduler_type=args.lr_scheduler_type, - evaluation_strategy="epoch", + eval_strategy="epoch", save_strategy="epoch", logging_strategy="epoch", per_device_train_batch_size=args.batch_size, diff --git a/examples/research_projects/layoutlmv3/README.md b/examples/research_projects/layoutlmv3/README.md index 17bf4bb67cd90f..2cc0fb75bd2c16 100644 --- a/examples/research_projects/layoutlmv3/README.md +++ b/examples/research_projects/layoutlmv3/README.md @@ -32,7 +32,7 @@ python run_funsd_cord.py \ --do_train \ --do_eval \ --max_steps 1000 \ - --evaluation_strategy steps \ + --eval_strategy steps \ --eval_steps 100 \ --learning_rate 1e-5 \ --load_best_model_at_end \ @@ -57,7 +57,7 @@ python run_funsd_cord.py \ --do_train \ --do_eval \ --max_steps 1000 \ - --evaluation_strategy steps \ + --eval_strategy steps \ --eval_steps 100 \ --learning_rate 5e-5 \ --load_best_model_at_end \ diff --git a/examples/research_projects/robust-speech-event/README.md b/examples/research_projects/robust-speech-event/README.md index 5c7bf42a00445a..ca3c5cdecdecea 100644 --- a/examples/research_projects/robust-speech-event/README.md +++ b/examples/research_projects/robust-speech-event/README.md @@ -362,7 +362,7 @@ echo '''python run_speech_recognition_ctc.py \ --per_device_train_batch_size="2" \ --learning_rate="3e-4" \ --save_total_limit="1" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --text_column_name="sentence" \ --length_column_name="input_length" \ --save_steps="5" \ @@ -438,7 +438,7 @@ echo '''python run_speech_recognition_ctc.py \ --learning_rate="7.5e-5" \ --warmup_steps="2000" \ --length_column_name="input_length" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --text_column_name="sentence" \ --chars_to_ignore , ? . ! 
\- \; \: \" “ % ‘ ” � — ’ … – \ --save_steps="500" \ diff --git a/examples/research_projects/self-training-text-classification/README.md b/examples/research_projects/self-training-text-classification/README.md index 7e0f3f97148ee6..062d5de7afd057 100644 --- a/examples/research_projects/self-training-text-classification/README.md +++ b/examples/research_projects/self-training-text-classification/README.md @@ -51,7 +51,7 @@ parameters_dict = { 'train_file': os.path.join(data_dir, 'train.csv'), 'infer_file': os.path.join(data_dir, 'infer.csv'), 'eval_file': os.path.join(data_dir, 'eval.csv'), - 'evaluation_strategy': 'steps', + 'eval_strategy': 'steps', 'task_name': 'scitail', 'label_list': ['entails', 'neutral'], 'per_device_train_batch_size': 32, diff --git a/examples/research_projects/self-training-text-classification/finetuning.py b/examples/research_projects/self-training-text-classification/finetuning.py index eeb0a285dff987..0afff6a91eadca 100644 --- a/examples/research_projects/self-training-text-classification/finetuning.py +++ b/examples/research_projects/self-training-text-classification/finetuning.py @@ -190,7 +190,7 @@ class FTTrainingArguments: ) }, ) - evaluation_strategy: Optional[str] = dataclasses.field( + eval_strategy: Optional[str] = dataclasses.field( default="no", metadata={ "help": 'The evaluation strategy to adopt during training. Possible values are: ["no", "step", "epoch]' @@ -198,7 +198,7 @@ class FTTrainingArguments: ) eval_steps: Optional[int] = dataclasses.field( default=1, - metadata={"help": 'Number of update steps between two evaluations if `evaluation_strategy="steps"`.'}, + metadata={"help": 'Number of update steps between two evaluations if `eval_strategy="steps"`.'}, ) eval_metric: Optional[str] = dataclasses.field( default="accuracy", metadata={"help": "The evaluation metric used for the task."} @@ -265,7 +265,7 @@ def train(args, accelerator, model, tokenizer, train_dataloader, optimizer, lr_s # Evaluate during training if ( eval_dataloader is not None - and args.evaluation_strategy == IntervalStrategy.STEPS.value + and args.eval_strategy == IntervalStrategy.STEPS.value and args.eval_steps > 0 and completed_steps % args.eval_steps == 0 ): @@ -331,7 +331,7 @@ def train(args, accelerator, model, tokenizer, train_dataloader, optimizer, lr_s break # Evaluate during training - if eval_dataloader is not None and args.evaluation_strategy == IntervalStrategy.EPOCH.value: + if eval_dataloader is not None and args.eval_strategy == IntervalStrategy.EPOCH.value: accelerator.wait_for_everyone() new_checkpoint = f"checkpoint-{IntervalStrategy.EPOCH.value}-{epoch}" new_eval_result = evaluate(args, accelerator, eval_dataloader, "eval", model, new_checkpoint)[ @@ -571,7 +571,7 @@ def finetune(accelerator, model_name_or_path, train_file, output_dir, **kwargs): assert args.train_file is not None data_files[Split.TRAIN.value] = args.train_file - if args.do_eval or args.evaluation_strategy != IntervalStrategy.NO.value: + if args.do_eval or args.eval_strategy != IntervalStrategy.NO.value: assert args.eval_file is not None data_files[Split.EVAL.value] = args.eval_file diff --git a/examples/research_projects/self-training-text-classification/run.sh b/examples/research_projects/self-training-text-classification/run.sh index 435a41461801e6..34e91d7c127c89 100755 --- a/examples/research_projects/self-training-text-classification/run.sh +++ b/examples/research_projects/self-training-text-classification/run.sh @@ -60,7 +60,7 @@ parameters_dict = { 'train_file': 
os.path.join(data_dir, '${TRAIN_FILE}'), 'infer_file': os.path.join(data_dir, '${INFER_FILE}'), 'eval_file': os.path.join(data_dir, '${EVAL_FILE}'), - 'evaluation_strategy': 'steps', + 'eval_strategy': 'steps', 'task_name': 'scitail', 'label_list': ['entails', 'neutral'], 'per_device_train_batch_size': 32, diff --git a/examples/research_projects/self-training-text-classification/selftraining.py b/examples/research_projects/self-training-text-classification/selftraining.py index 70a6c2f319e0cb..d741225b061e88 100644 --- a/examples/research_projects/self-training-text-classification/selftraining.py +++ b/examples/research_projects/self-training-text-classification/selftraining.py @@ -79,7 +79,7 @@ class STTrainingArguments: eval_metric: Optional[str] = dataclasses.field( default="accuracy", metadata={"help": "The evaluation metric used for the task."} ) - evaluation_strategy: Optional[str] = dataclasses.field( + eval_strategy: Optional[str] = dataclasses.field( default="no", metadata={ "help": 'The evaluation strategy to adopt during training. Possible values are: ["no", "step", "epoch]' @@ -208,7 +208,7 @@ def selftrain(model_name_or_path, train_file, infer_file, output_dir, **kwargs): data_files["train"] = args.train_file data_files["infer"] = args.infer_file - if args.evaluation_strategy != IntervalStrategy.NO.value: + if args.eval_strategy != IntervalStrategy.NO.value: assert args.eval_file is not None data_files["eval"] = args.eval_file @@ -267,7 +267,7 @@ def selftrain(model_name_or_path, train_file, infer_file, output_dir, **kwargs): "label_list": args.label_list, "output_dir": current_output_dir, "eval_metric": args.eval_metric, - "evaluation_strategy": args.evaluation_strategy, + "eval_strategy": args.eval_strategy, "early_stopping_patience": args.early_stopping_patience, "early_stopping_threshold": args.early_stopping_threshold, "seed": args.seed, @@ -341,7 +341,7 @@ def selftrain(model_name_or_path, train_file, infer_file, output_dir, **kwargs): data_files["train_pseudo"] = os.path.join(next_data_dir, f"train_pseudo.{args.data_file_extension}") - if args.evaluation_strategy != IntervalStrategy.NO.value: + if args.eval_strategy != IntervalStrategy.NO.value: new_eval_result = eval_result if best_iteration is None: diff --git a/examples/research_projects/tapex/README.md b/examples/research_projects/tapex/README.md index 7d98901e281e65..b98eb9b428d01c 100644 --- a/examples/research_projects/tapex/README.md +++ b/examples/research_projects/tapex/README.md @@ -71,7 +71,7 @@ python run_wikisql_with_tapex.py \ --eval_steps 1000 \ --save_steps 1000 \ --warmup_steps 1000 \ - --evaluation_strategy steps \ + --eval_strategy steps \ --predict_with_generate \ --num_beams 5 \ --weight_decay 1e-2 \ @@ -101,7 +101,7 @@ python run_wikisql_with_tapex.py \ --eval_steps 1000 \ --save_steps 1000 \ --warmup_steps 1000 \ - --evaluation_strategy steps \ + --eval_strategy steps \ --predict_with_generate \ --num_beams 5 \ --weight_decay 1e-2 \ @@ -132,7 +132,7 @@ python run_wikitablequestions_with_tapex.py \ --eval_steps 1000 \ --save_steps 1000 \ --warmup_steps 1000 \ - --evaluation_strategy steps \ + --eval_strategy steps \ --predict_with_generate \ --num_beams 5 \ --weight_decay 1e-2 \ @@ -162,7 +162,7 @@ python run_wikitablequestions_with_tapex.py \ --eval_steps 1000 \ --save_steps 1000 \ --warmup_steps 1000 \ - --evaluation_strategy steps \ + --eval_strategy steps \ --predict_with_generate \ --num_beams 5 \ --weight_decay 1e-2 \ @@ -223,7 +223,7 @@ python run_tabfact_with_tapex.py \ --learning_rate 
3e-5 \ --eval_steps 1000 \ --save_steps 1000 \ - --evaluation_strategy steps \ + --eval_strategy steps \ --weight_decay 1e-2 \ --max_steps 30000 \ --max_grad_norm 0.1 @@ -252,7 +252,7 @@ python run_tabfact_with_tapex.py \ --learning_rate 3e-5 \ --eval_steps 1000 \ --save_steps 1000 \ - --evaluation_strategy steps \ + --eval_strategy steps \ --weight_decay 1e-2 \ --max_steps 30000 \ --max_grad_norm 0.1 diff --git a/examples/research_projects/wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md b/examples/research_projects/wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md index 52553532fe08ab..7a580a36132441 100644 --- a/examples/research_projects/wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md +++ b/examples/research_projects/wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md @@ -182,7 +182,7 @@ Here we will run the script on the *Turkish* Common Voice dataset for demonstrat --per_device_train_batch_size="16" \ --learning_rate="3e-4" \ --warmup_steps="500" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --save_steps="400" \ --eval_steps="400" \ --logging_steps="400" \ @@ -209,7 +209,7 @@ Here we will run the script on the *Turkish* Common Voice dataset for demonstrat --per_device_train_batch_size="16" \ --learning_rate="3e-4" \ --warmup_steps="500" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --save_steps="400" \ --eval_steps="400" \ --logging_steps="400" \ diff --git a/examples/research_projects/wav2vec2/README.md b/examples/research_projects/wav2vec2/README.md index cc667d6567ff95..88f62778a3add9 100644 --- a/examples/research_projects/wav2vec2/README.md +++ b/examples/research_projects/wav2vec2/README.md @@ -18,7 +18,7 @@ python run_asr.py \ --num_train_epochs="30" \ --per_device_train_batch_size="20" \ --per_device_eval_batch_size="20" \ ---evaluation_strategy="steps" \ +--eval_strategy="steps" \ --save_steps="500" \ --eval_steps="100" \ --logging_steps="50" \ @@ -73,7 +73,7 @@ python run_asr.py \ --per_device_train_batch_size="1" \ --per_device_eval_batch_size="1" \ --gradient_accumulation_steps="8" \ ---evaluation_strategy="steps" \ +--eval_strategy="steps" \ --save_steps="500" \ --eval_steps="100" \ --logging_steps="50" \ @@ -152,7 +152,7 @@ ZeRO-2: PYTHONPATH=../../../src deepspeed --num_gpus 2 \ run_asr.py \ --output_dir=output_dir --num_train_epochs=2 --per_device_train_batch_size=2 \ ---per_device_eval_batch_size=2 --evaluation_strategy=steps --save_steps=500 --eval_steps=100 \ +--per_device_eval_batch_size=2 --eval_strategy=steps --save_steps=500 --eval_steps=100 \ --logging_steps=5 --learning_rate=5e-4 --warmup_steps=3000 \ --model_name_or_path=patrickvonplaten/wav2vec2_tiny_random_robust \ --dataset_name=hf-internal-testing/librispeech_asr_dummy --dataset_config_name=clean \ @@ -176,7 +176,7 @@ ZeRO-3: PYTHONPATH=../../../src deepspeed --num_gpus 2 \ run_asr.py \ --output_dir=output_dir --num_train_epochs=2 --per_device_train_batch_size=2 \ ---per_device_eval_batch_size=2 --evaluation_strategy=steps --save_steps=500 --eval_steps=100 \ +--per_device_eval_batch_size=2 --eval_strategy=steps --save_steps=500 --eval_steps=100 \ --logging_steps=5 --learning_rate=5e-4 --warmup_steps=3000 \ --model_name_or_path=patrickvonplaten/wav2vec2_tiny_random_robust \ --dataset_name=hf-internal-testing/librispeech_asr_dummy --dataset_config_name=clean \ diff --git a/examples/research_projects/wav2vec2/finetune_base_100.sh b/examples/research_projects/wav2vec2/finetune_base_100.sh index 8002dd81235f9e..254b0afef3d62e 100755 --- a/examples/research_projects/wav2vec2/finetune_base_100.sh +++ 
b/examples/research_projects/wav2vec2/finetune_base_100.sh @@ -4,7 +4,7 @@ python run_asr.py \ --num_train_epochs="30" \ --per_device_train_batch_size="32" \ --per_device_eval_batch_size="32" \ ---evaluation_strategy="steps" \ +--eval_strategy="steps" \ --save_total_limit="3" \ --save_steps="500" \ --eval_steps="100" \ diff --git a/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh b/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh index 6219e26b642f63..508cb532b0f08d 100755 --- a/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh +++ b/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh @@ -4,7 +4,7 @@ python run_asr.py \ --num_train_epochs="30" \ --per_device_train_batch_size="20" \ --per_device_eval_batch_size="20" \ ---evaluation_strategy="steps" \ +--eval_strategy="steps" \ --save_steps="500" \ --eval_steps="100" \ --logging_steps="50" \ diff --git a/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh b/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh index 3d2423df970c8e..6956b093e72530 100755 --- a/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh +++ b/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh @@ -4,7 +4,7 @@ python run_asr.py \ --num_train_epochs="30" \ --per_device_train_batch_size="16" \ --per_device_eval_batch_size="16" \ ---evaluation_strategy="steps" \ +--eval_strategy="steps" \ --save_total_limit="3" \ --save_steps="500" \ --eval_steps="100" \ diff --git a/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh b/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh index eb9671d015271e..fa02e71ea82c68 100755 --- a/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh +++ b/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh @@ -5,7 +5,7 @@ python run_asr.py \ --per_device_train_batch_size="2" \ --per_device_eval_batch_size="2" \ --gradient_accumulation_steps="4" \ ---evaluation_strategy="steps" \ +--eval_strategy="steps" \ --save_steps="500" \ --eval_steps="100" \ --logging_steps="50" \ diff --git a/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh b/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh index 9b325c42771e64..e90bc8caa6c001 100755 --- a/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh +++ b/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh @@ -5,7 +5,7 @@ python run_asr.py \ --per_device_train_batch_size="1" \ --per_device_eval_batch_size="1" \ --gradient_accumulation_steps="8" \ ---evaluation_strategy="steps" \ +--eval_strategy="steps" \ --save_steps="500" \ --eval_steps="100" \ --logging_steps="50" \ diff --git a/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh b/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh index 0726bb09eb51e2..70da0e0a0d1219 100644 --- a/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh +++ b/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh @@ -6,7 +6,7 @@ python run_common_voice.py \ --overwrite_output_dir \ --num_train_epochs="5" \ --per_device_train_batch_size="16" \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --learning_rate="3e-4" \ --warmup_steps="500" \ --fp16 \ diff --git a/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py b/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py index d44145f3e0c12f..8fb2df71112594 
100644 --- a/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py +++ b/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py @@ -161,7 +161,7 @@ def run_trainer( --num_train_epochs {str(num_train_epochs)} --per_device_train_batch_size 2 --per_device_eval_batch_size 2 - --evaluation_strategy steps + --eval_strategy steps --learning_rate 5e-4 --warmup_steps 8 --orthography timit diff --git a/examples/research_projects/xtreme-s/README.md b/examples/research_projects/xtreme-s/README.md index dc7e783c75d124..5314ba9880ad35 100644 --- a/examples/research_projects/xtreme-s/README.md +++ b/examples/research_projects/xtreme-s/README.md @@ -90,7 +90,7 @@ python -m torch.distributed.launch \ --gradient_accumulation_steps=2 \ --learning_rate="3e-4" \ --warmup_steps=3000 \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --max_duration_in_seconds=20 \ --save_steps=500 \ --eval_steps=500 \ @@ -134,7 +134,7 @@ python -m torch.distributed.launch \ --gradient_accumulation_steps=1 \ --learning_rate="3e-4" \ --warmup_steps=1500 \ - --evaluation_strategy="steps" \ + --eval_strategy="steps" \ --max_duration_in_seconds=30 \ --save_steps=200 \ --eval_steps=200 \ diff --git a/examples/tensorflow/image-classification/README.md b/examples/tensorflow/image-classification/README.md index 96979330ddc5b5..a343b443ef1ae5 100644 --- a/examples/tensorflow/image-classification/README.md +++ b/examples/tensorflow/image-classification/README.md @@ -45,7 +45,7 @@ python run_image_classification.py \ --per_device_eval_batch_size 8 \ --logging_strategy steps \ --logging_steps 10 \ - --evaluation_strategy epoch \ + --eval_strategy epoch \ --save_strategy epoch \ --load_best_model_at_end True \ --save_total_limit 3 \ diff --git a/tests/trainer/test_trainer_seq2seq.py b/tests/trainer/test_trainer_seq2seq.py index 5520d07c5a5ffc..d8722c67836f26 100644 --- a/tests/trainer/test_trainer_seq2seq.py +++ b/tests/trainer/test_trainer_seq2seq.py @@ -113,7 +113,7 @@ def _compute_metrics(pred): per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, predict_with_generate=True, - evaluation_strategy="steps", + eval_strategy="steps", do_train=True, do_eval=True, warmup_steps=0, From f6cd796a517a52d78f0a7944de09b977c3a33b10 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 16 Apr 2024 11:48:48 -0400 Subject: [PATCH 5/8] Clean --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 5da606f34a2b58..a0a8ce4d4d5f2b 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -18,7 +18,7 @@ import math import os import warnings -from dataclasses import InitVar, asdict, dataclass, field, fields +from dataclasses import asdict, dataclass, field, fields from datetime import timedelta from enum import Enum from pathlib import Path From 4f1654ccba6b8a89ba8e6bf686f7b1ba8c8c4202 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 16 Apr 2024 12:04:57 -0400 Subject: [PATCH 6/8] Change typing? 
---
 src/transformers/training_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index a0a8ce4d4d5f2b..ef54fb8b7b307b 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1269,7 +1269,7 @@ class TrainingArguments:
             "choices": ["auto", "apex", "cpu_amp"],
         },
     )
-    evaluation_strategy: Union[IntervalStrategy, str] = field(
+    evaluation_strategy: Optional[str] = field(
         default=None,
         metadata={"help": "Deprecated. Use `eval_strategy` instead"},
     )

From 53f83a5884004f638fd05bc6b8c8a1dea79fad26 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Tue, 16 Apr 2024 12:17:26 -0400
Subject: [PATCH 7/8] Fix tests

---
 src/transformers/training_args.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index ef54fb8b7b307b..7f0515975799b0 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -734,7 +734,7 @@ class TrainingArguments:
     do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
     do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
     eval_strategy: Union[IntervalStrategy, str] = field(
-        default=None,
+        default="no",
         metadata={"help": "The evaluation strategy to use."},
     )
     prediction_loss_only: bool = field(
@@ -1269,7 +1269,7 @@ class TrainingArguments:
             "choices": ["auto", "apex", "cpu_amp"],
         },
     )
-    evaluation_strategy: Optional[str] = field(
+    evaluation_strategy: Union[IntervalStrategy, str] = field(
         default=None,
         metadata={"help": "Deprecated. Use `eval_strategy` instead"},
     )

From 3a4af4d2770b05a144dce6b9b324870fa54c65cc Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 18 Apr 2024 12:27:11 -0400
Subject: [PATCH 8/8] Deprecation versions

---
 src/transformers/training_args.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 7f0515975799b0..57061182757030 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1399,14 +1399,14 @@ def __post_init__(self):

         if self.evaluation_strategy is not None:
             warnings.warn(
-                "`evaluation_strategy` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use `eval_strategy` instead",
+                "`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead",
                 FutureWarning,
             )
             self.eval_strategy = self.evaluation_strategy

         if isinstance(self.eval_strategy, EvaluationStrategy):
             warnings.warn(
-                "using `EvaluationStrategy` for `evaluation_strategy` is deprecated and will be removed in version 5"
+                "using `EvaluationStrategy` for `eval_strategy` is deprecated and will be removed in version 5"
                 " of 🤗 Transformers. Use `IntervalStrategy` instead",
                 FutureWarning,
            )
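
Taken together, the later commits leave `eval_strategy` as the canonical `TrainingArguments` field and keep `evaluation_strategy` only as a deprecated alias: PATCH 5/8 drops the now-unused `InitVar` import, PATCH 6/8 and 7/8 settle the typing and defaults of the two fields, and PATCH 8/8 copies the alias over in `__post_init__` behind a `FutureWarning` that names 4.46 as the removal version. A distilled, self-contained sketch of that pattern follows; `ExampleArguments` is an illustrative stand-in rather than the real class, and its help strings merely echo the ones in the diff.

```python
# Sketch of the "plain deprecated field + __post_init__ copy" pattern the series
# converges on. ExampleArguments is a hypothetical stand-in, not TrainingArguments.
import warnings
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ExampleArguments:
    # New canonical field.
    eval_strategy: str = field(default="no", metadata={"help": "The evaluation strategy to use."})
    # Deprecated alias kept for a few releases so existing call sites still work.
    evaluation_strategy: Optional[str] = field(
        default=None, metadata={"help": "Deprecated. Use `eval_strategy` instead"}
    )

    def __post_init__(self):
        if self.evaluation_strategy is not None:
            warnings.warn(
                "`evaluation_strategy` is deprecated; use `eval_strategy` instead",
                FutureWarning,
            )
            # Copy the legacy value onto the canonical field.
            self.eval_strategy = self.evaluation_strategy
```

One practical property of this shape is that the deprecated alias stays an ordinary dataclass field, so it still shows up in `dataclasses.fields()` and `asdict()` and round-trips through the same serialization paths as every other argument, which an `InitVar` pseudo-field would not.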
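
From the caller's side, a minimal usage sketch of the end state, assuming an environment whose transformers build already contains these commits (with `torch`/`accelerate` installed so `TrainingArguments` can be constructed); `output_dir="test_trainer"` is just a placeholder:

```python
# Minimal sketch: the new spelling is accepted directly, the old one still works
# but is copied into `eval_strategy` with a FutureWarning, per PATCH 8/8.
import warnings

from transformers import TrainingArguments

# New spelling.
args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")
assert args.eval_strategy == "epoch"

# Old spelling: still accepted, but __post_init__ forwards it and warns.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

assert legacy.eval_strategy == "epoch"
assert any(issubclass(w.category, FutureWarning) for w in caught)
```

Comparing against the plain string is deliberate: `IntervalStrategy` is a string-backed enum, so the assertion holds whether or not `__post_init__` has normalized the value to the enum member.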