From 17b141a697337fd1b553d9a4fb3484ca501a1f5d Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 10 Apr 2024 08:38:26 -0400 Subject: [PATCH 1/6] Enforce saving at end of training --- src/transformers/trainer_callback.py | 3 ++ src/transformers/training_args.py | 3 ++ tests/trainer/test_trainer.py | 44 ++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index a21c46fea9fe2a..5d005384de41d2 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -544,6 +544,9 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra # End training if state.global_step >= state.max_steps: control.should_training_stop = True + # Save the best model at the end if we have a save strategy + if args.save_strategy != IntervalStrategy.NO: + control.should_save = True return control diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 2807c9951aa6d6..5ae32c451d49fe 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -333,6 +333,9 @@ class TrainingArguments: - `"no"`: No save is done during training. - `"epoch"`: Save is done at the end of each epoch. - `"steps"`: Save is done every `save_steps`. + + If `"epoch"` or `"steps"` is chosen, saving will also be performed at the + very end of training, always. save_steps (`int` or `float`, *optional*, defaults to 500): Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index c420da4052f186..952dbb90d7f809 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1968,6 +1968,50 @@ def test_safe_checkpoints(self): tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors ) + def test_load_best_model_with_save(self): + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, + save_steps=5, + evaluation_strategy="steps", + eval_steps=5, + ) + trainer.train() + # Check that we have the last known step: + assert os.path.exists( + os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}") + ), f"Could not find checkpoint-{trainer.state.max_steps}" + # And then check the last multiple + last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5 + assert os.path.exists( + os.path.join(tmpdir, f"checkpoint-{last_multiple}") + ), f"Could not find checkpoint-{last_multiple}" + + # Now test that using a limit works + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, + save_steps=5, + evaluation_strategy="steps", + eval_steps=5, + load_best_model_at_end=True, + save_total_limit=2, + ) + trainer.train() + # Check that we have the last known step: + assert os.path.exists( + os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}") + ), f"Could not find checkpoint-{trainer.state.max_steps}" + # And then check the last multiple + last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5 + assert os.path.exists( + os.path.join(tmpdir, f"checkpoint-{last_multiple}") + ), f"Could not find checkpoint-{last_multiple}" + # Finally check that we don't have an old one + assert not os.path.exists( + os.path.join(tmpdir, 
f"checkpoint-{trainer.state.max_steps-10}") + ), f"Found checkpoint-{trainer.state.max_steps-10}, limit not respected" + @require_torch_multi_accelerator def test_run_seq2seq_double_train_wrap_once(self): # test that we don't wrap the model more than once From 57dbe85fb98d0ac24204bd3ed917e6902ee9636a Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 10 Apr 2024 08:53:54 -0400 Subject: [PATCH 2/6] Fix test --- tests/trainer/test_trainer_callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_callback.py b/tests/trainer/test_trainer_callback.py index 8c0c9367d8d779..9eeb1d5e412e17 100644 --- a/tests/trainer/test_trainer_callback.py +++ b/tests/trainer/test_trainer_callback.py @@ -153,7 +153,7 @@ def get_expected_events(self, trainer): expected_events.append("on_log") if trainer.args.eval_strategy == IntervalStrategy.STEPS and step % trainer.args.eval_steps == 0: expected_events += evaluation_events.copy() - if step % trainer.args.save_steps == 0: + if step % trainer.args.save_steps == 0 or step == trainer.state.max_steps: expected_events.append("on_save") expected_events.append("on_epoch_end") if trainer.args.eval_strategy == IntervalStrategy.EPOCH: From b73f1a4c1f2ddbd4845c718a1d6581eb28478797 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 29 Apr 2024 13:28:37 -0400 Subject: [PATCH 3/6] Rework test --- tests/trainer/test_trainer.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 952dbb90d7f809..362c4a3bc17048 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -34,6 +34,7 @@ from huggingface_hub import HfFolder, ModelCard, delete_repo, list_repo_commits, list_repo_files from parameterized import parameterized from requests.exceptions import HTTPError +from safetensors.torch import load_file from transformers import ( AutoTokenizer, @@ -1975,19 +1976,21 @@ def test_load_best_model_with_save(self): save_steps=5, evaluation_strategy="steps", eval_steps=5, + max_steps=9, ) trainer.train() # Check that we have the last known step: assert os.path.exists( os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}") ), f"Could not find checkpoint-{trainer.state.max_steps}" - # And then check the last multiple - last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5 - assert os.path.exists( - os.path.join(tmpdir, f"checkpoint-{last_multiple}") - ), f"Could not find checkpoint-{last_multiple}" + # And then check the last step + assert os.path.exists(os.path.join(tmpdir, "checkpoint-9")), "Could not find checkpoint-9" # Now test that using a limit works + # Should result in: + # - save at step 5 (but is deleted) + # - save at step 10 (loaded in at the end when `load_best_model=True`) + # - save at step 11 with tempfile.TemporaryDirectory() as tmpdir: trainer = get_regression_trainer( output_dir=tmpdir, @@ -1996,21 +1999,23 @@ def test_load_best_model_with_save(self): eval_steps=5, load_best_model_at_end=True, save_total_limit=2, + max_steps=11, ) trainer.train() # Check that we have the last known step: - assert os.path.exists( - os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}") - ), f"Could not find checkpoint-{trainer.state.max_steps}" + assert os.path.exists(os.path.join(tmpdir, "checkpoint-11")), "Could not find checkpoint-11" # And then check the last multiple - last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5 - assert os.path.exists( - 
os.path.join(tmpdir, f"checkpoint-{last_multiple}") - ), f"Could not find checkpoint-{last_multiple}" + assert os.path.exists(os.path.join(tmpdir, "checkpoint-10")), "Could not find checkpoint-10" # Finally check that we don't have an old one - assert not os.path.exists( - os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps-10}") - ), f"Found checkpoint-{trainer.state.max_steps-10}, limit not respected" + assert not os.path.exists(os.path.join(tmpdir, "checkpoint-5")), "Found checkpoint-5, limit not respected" + + # Finally check that the right model was loaded in, checkpoint-10 + # this goes by the last `eval` step check to do so, so it won't be + # the last model *saved* + model_state = trainer.model.state_dict() + final_model_weights = load_file(os.path.join(tmpdir, "checkpoint-10", "model.safetensors")) + for k, v in model_state.items(): + assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same" @require_torch_multi_accelerator def test_run_seq2seq_double_train_wrap_once(self): From 370e6cfe8dba74ddfcb469b757334b12a233e407 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 29 Apr 2024 14:25:39 -0400 Subject: [PATCH 4/6] Fixup tests' --- tests/trainer/test_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 362c4a3bc17048..9e669dd53badea 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -34,7 +34,6 @@ from huggingface_hub import HfFolder, ModelCard, delete_repo, list_repo_commits, list_repo_files from parameterized import parameterized from requests.exceptions import HTTPError -from safetensors.torch import load_file from transformers import ( AutoTokenizer, @@ -128,6 +127,7 @@ if is_safetensors_available(): import safetensors.torch + # for version specific tests in TrainerIntegrationTest require_accelerate_version_min_0_28 = partial(require_accelerate, min_version="0.28") @@ -2013,7 +2013,7 @@ def test_load_best_model_with_save(self): # this goes by the last `eval` step check to do so, so it won't be # the last model *saved* model_state = trainer.model.state_dict() - final_model_weights = load_file(os.path.join(tmpdir, "checkpoint-10", "model.safetensors")) + final_model_weights = safetensors.torch.load_file(os.path.join(tmpdir, "checkpoint-10", "model.safetensors")) for k, v in model_state.items(): assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same" From 4002cb85b5f4bd84ceb445867c9b287d6edeed1e Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 29 Apr 2024 14:26:47 -0400 Subject: [PATCH 5/6] Update comment based on sourab feedback --- src/transformers/trainer_callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 5d005384de41d2..d2570ed8ba4317 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -544,7 +544,7 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra # End training if state.global_step >= state.max_steps: control.should_training_stop = True - # Save the best model at the end if we have a save strategy + # Save the model at the end if we have a save strategy if args.save_strategy != IntervalStrategy.NO: control.should_save = True From 95b018370433bf582e3d726f791829de81259355 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 15 May 2024 14:51:34 -0400 Subject: [PATCH 6/6] Clean --- tests/trainer/test_trainer.py | 6 
++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9e669dd53badea..3aa95ad0e3ba87 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -127,7 +127,7 @@ if is_safetensors_available(): import safetensors.torch - + # for version specific tests in TrainerIntegrationTest require_accelerate_version_min_0_28 = partial(require_accelerate, min_version="0.28") @@ -2013,7 +2013,9 @@ def test_load_best_model_with_save(self): # this goes by the last `eval` step check to do so, so it won't be # the last model *saved* model_state = trainer.model.state_dict() - final_model_weights = safetensors.torch.load_file(os.path.join(tmpdir, "checkpoint-10", "model.safetensors")) + final_model_weights = safetensors.torch.load_file( + os.path.join(tmpdir, "checkpoint-10", "model.safetensors") + ) for k, v in model_state.items(): assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"
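
Taken together, the functional change in this series is the small addition to DefaultFlowCallback.on_step_end: when training stops, control.should_save is forced to True as long as any save strategy other than "no" is configured. The snippet below is a minimal standalone sketch of that decision, not code from src/transformers/trainer_callback.py; the function saved_steps and its parameters are invented here only to illustrate the before/after behavior that the reworked test (max_steps=11, save_steps=5) depends on.

    # Hypothetical illustration of the save decision patched above. This is a
    # simulation only, not the transformers implementation.
    def saved_steps(max_steps: int, save_steps: int, save_strategy: str, force_final_save: bool):
        """Return the global steps at which a checkpoint would be written."""
        saves = []
        for step in range(1, max_steps + 1):
            should_save = save_strategy == "steps" and step % save_steps == 0
            # The patch's addition: on the step that ends training, always save
            # if a save strategy other than "no" is configured.
            if step >= max_steps and force_final_save and save_strategy != "no":
                should_save = True
            if should_save:
                saves.append(step)
        return saves

    if __name__ == "__main__":
        print(saved_steps(11, 5, "steps", force_final_save=False))  # [5, 10]
        print(saved_steps(11, 5, "steps", force_final_save=True))   # [5, 10, 11]

Under the second configuration in the reworked test, the forced final save produces checkpoint-5, checkpoint-10 and checkpoint-11; save_total_limit=2 then prunes checkpoint-5, and load_best_model_at_end reloads checkpoint-10 because, as the test comment notes, best-model tracking follows the last evaluation step rather than the last save. That is why the weight comparison targets checkpoint-10 instead of the final checkpoint-11.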