From 17b141a697337fd1b553d9a4fb3484ca501a1f5d Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 10 Apr 2024 08:38:26 -0400 Subject: [PATCH 1/6] Enforce saving at end of training --- src/transformers/trainer_callback.py | 3 ++ src/transformers/training_args.py | 3 ++ tests/trainer/test_trainer.py | 44 ++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index a21c46fea9fe2a..5d005384de41d2 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -544,6 +544,9 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra # End training if state.global_step >= state.max_steps: control.should_training_stop = True + # Save the best model at the end if we have a save strategy + if args.save_strategy != IntervalStrategy.NO: + control.should_save = True return control diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 2807c9951aa6d6..5ae32c451d49fe 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -333,6 +333,9 @@ class TrainingArguments: - `"no"`: No save is done during training. - `"epoch"`: Save is done at the end of each epoch. - `"steps"`: Save is done every `save_steps`. + + If `"epoch"` or `"steps"` is chosen, saving will also be performed at the + very end of training, always. save_steps (`int` or `float`, *optional*, defaults to 500): Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index c420da4052f186..952dbb90d7f809 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1968,6 +1968,50 @@ def test_safe_checkpoints(self): tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors ) + def test_load_best_model_with_save(self): + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, + save_steps=5, + evaluation_strategy="steps", + eval_steps=5, + ) + trainer.train() + # Check that we have the last known step: + assert os.path.exists( + os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}") + ), f"Could not find checkpoint-{trainer.state.max_steps}" + # And then check the last multiple + last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5 + assert os.path.exists( + os.path.join(tmpdir, f"checkpoint-{last_multiple}") + ), f"Could not find checkpoint-{last_multiple}" + + # Now test that using a limit works + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, + save_steps=5, + evaluation_strategy="steps", + eval_steps=5, + load_best_model_at_end=True, + save_total_limit=2, + ) + trainer.train() + # Check that we have the last known step: + assert os.path.exists( + os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}") + ), f"Could not find checkpoint-{trainer.state.max_steps}" + # And then check the last multiple + last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5 + assert os.path.exists( + os.path.join(tmpdir, f"checkpoint-{last_multiple}") + ), f"Could not find checkpoint-{last_multiple}" + # Finally check that we don't have an old one + assert not os.path.exists( + os.path.join(tmpdir, 
f"checkpoint-{trainer.state.max_steps-10}") + ), f"Found checkpoint-{trainer.state.max_steps-10}, limit not respected" + @require_torch_multi_accelerator def test_run_seq2seq_double_train_wrap_once(self): # test that we don't wrap the model more than once From 57dbe85fb98d0ac24204bd3ed917e6902ee9636a Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 10 Apr 2024 08:53:54 -0400 Subject: [PATCH 2/6] Fix test --- tests/trainer/test_trainer_callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_callback.py b/tests/trainer/test_trainer_callback.py index 8c0c9367d8d779..9eeb1d5e412e17 100644 --- a/tests/trainer/test_trainer_callback.py +++ b/tests/trainer/test_trainer_callback.py @@ -153,7 +153,7 @@ def get_expected_events(self, trainer): expected_events.append("on_log") if trainer.args.eval_strategy == IntervalStrategy.STEPS and step % trainer.args.eval_steps == 0: expected_events += evaluation_events.copy() - if step % trainer.args.save_steps == 0: + if step % trainer.args.save_steps == 0 or step == trainer.state.max_steps: expected_events.append("on_save") expected_events.append("on_epoch_end") if trainer.args.eval_strategy == IntervalStrategy.EPOCH: From b73f1a4c1f2ddbd4845c718a1d6581eb28478797 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 29 Apr 2024 13:28:37 -0400 Subject: [PATCH 3/6] Rework test --- tests/trainer/test_trainer.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 952dbb90d7f809..362c4a3bc17048 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -34,6 +34,7 @@ from huggingface_hub import HfFolder, ModelCard, delete_repo, list_repo_commits, list_repo_files from parameterized import parameterized from requests.exceptions import HTTPError +from safetensors.torch import load_file from transformers import ( AutoTokenizer, @@ -1975,19 +1976,21 @@ def test_load_best_model_with_save(self): save_steps=5, evaluation_strategy="steps", eval_steps=5, + max_steps=9, ) trainer.train() # Check that we have the last known step: assert os.path.exists( os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}") ), f"Could not find checkpoint-{trainer.state.max_steps}" - # And then check the last multiple - last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5 - assert os.path.exists( - os.path.join(tmpdir, f"checkpoint-{last_multiple}") - ), f"Could not find checkpoint-{last_multiple}" + # And then check the last step + assert os.path.exists(os.path.join(tmpdir, "checkpoint-9")), "Could not find checkpoint-9" # Now test that using a limit works + # Should result in: + # - save at step 5 (but is deleted) + # - save at step 10 (loaded in at the end when `load_best_model=True`) + # - save at step 11 with tempfile.TemporaryDirectory() as tmpdir: trainer = get_regression_trainer( output_dir=tmpdir, @@ -1996,21 +1999,23 @@ def test_load_best_model_with_save(self): eval_steps=5, load_best_model_at_end=True, save_total_limit=2, + max_steps=11, ) trainer.train() # Check that we have the last known step: - assert os.path.exists( - os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}") - ), f"Could not find checkpoint-{trainer.state.max_steps}" + assert os.path.exists(os.path.join(tmpdir, "checkpoint-11")), "Could not find checkpoint-11" # And then check the last multiple - last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5 - assert os.path.exists( - 
os.path.join(tmpdir, f"checkpoint-{last_multiple}") - ), f"Could not find checkpoint-{last_multiple}" + assert os.path.exists(os.path.join(tmpdir, "checkpoint-10")), "Could not find checkpoint-10" # Finally check that we don't have an old one - assert not os.path.exists( - os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps-10}") - ), f"Found checkpoint-{trainer.state.max_steps-10}, limit not respected" + assert not os.path.exists(os.path.join(tmpdir, "checkpoint-5")), "Found checkpoint-5, limit not respected" + + # Finally check that the right model was loaded in, checkpoint-10 + # this goes by the last `eval` step check to do so, so it won't be + # the last model *saved* + model_state = trainer.model.state_dict() + final_model_weights = load_file(os.path.join(tmpdir, "checkpoint-10", "model.safetensors")) + for k, v in model_state.items(): + assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same" @require_torch_multi_accelerator def test_run_seq2seq_double_train_wrap_once(self): From 370e6cfe8dba74ddfcb469b757334b12a233e407 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 29 Apr 2024 14:25:39 -0400 Subject: [PATCH 4/6] Fixup tests' --- tests/trainer/test_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 362c4a3bc17048..9e669dd53badea 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -34,7 +34,6 @@ from huggingface_hub import HfFolder, ModelCard, delete_repo, list_repo_commits, list_repo_files from parameterized import parameterized from requests.exceptions import HTTPError -from safetensors.torch import load_file from transformers import ( AutoTokenizer, @@ -128,6 +127,7 @@ if is_safetensors_available(): import safetensors.torch + # for version specific tests in TrainerIntegrationTest require_accelerate_version_min_0_28 = partial(require_accelerate, min_version="0.28") @@ -2013,7 +2013,7 @@ def test_load_best_model_with_save(self): # this goes by the last `eval` step check to do so, so it won't be # the last model *saved* model_state = trainer.model.state_dict() - final_model_weights = load_file(os.path.join(tmpdir, "checkpoint-10", "model.safetensors")) + final_model_weights = safetensors.torch.load_file(os.path.join(tmpdir, "checkpoint-10", "model.safetensors")) for k, v in model_state.items(): assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same" From 4002cb85b5f4bd84ceb445867c9b287d6edeed1e Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 29 Apr 2024 14:26:47 -0400 Subject: [PATCH 5/6] Update comment based on sourab feedback --- src/transformers/trainer_callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 5d005384de41d2..d2570ed8ba4317 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -544,7 +544,7 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra # End training if state.global_step >= state.max_steps: control.should_training_stop = True - # Save the best model at the end if we have a save strategy + # Save the model at the end if we have a save strategy if args.save_strategy != IntervalStrategy.NO: control.should_save = True From 95b018370433bf582e3d726f791829de81259355 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 15 May 2024 14:51:34 -0400 Subject: [PATCH 6/6] Clean --- tests/trainer/test_trainer.py | 6 
++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9e669dd53badea..3aa95ad0e3ba87 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -127,7 +127,7 @@ if is_safetensors_available(): import safetensors.torch - + # for version specific tests in TrainerIntegrationTest require_accelerate_version_min_0_28 = partial(require_accelerate, min_version="0.28") @@ -2013,7 +2013,9 @@ def test_load_best_model_with_save(self): # this goes by the last `eval` step check to do so, so it won't be # the last model *saved* model_state = trainer.model.state_dict() - final_model_weights = safetensors.torch.load_file(os.path.join(tmpdir, "checkpoint-10", "model.safetensors")) + final_model_weights = safetensors.torch.load_file( + os.path.join(tmpdir, "checkpoint-10", "model.safetensors") + ) for k, v in model_state.items(): assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"
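
Taken together, the functional change in this series is the small addition to DefaultFlowCallback.on_step_end: when training stops, control.should_save is forced to True as long as any save strategy other than "no" is configured. The snippet below is a minimal standalone sketch of that decision, not code from src/transformers/trainer_callback.py; the function saved_steps and its parameters are invented here only to illustrate the before/after behavior that the reworked test (max_steps=11, save_steps=5) depends on.

    # Hypothetical illustration of the save decision patched above. This is a
    # simulation only, not the transformers implementation.
    def saved_steps(max_steps: int, save_steps: int, save_strategy: str, force_final_save: bool):
        """Return the global steps at which a checkpoint would be written."""
        saves = []
        for step in range(1, max_steps + 1):
            should_save = save_strategy == "steps" and step % save_steps == 0
            # The patch's addition: on the step that ends training, always save
            # if a save strategy other than "no" is configured.
            if step >= max_steps and force_final_save and save_strategy != "no":
                should_save = True
            if should_save:
                saves.append(step)
        return saves

    if __name__ == "__main__":
        print(saved_steps(11, 5, "steps", force_final_save=False))  # [5, 10]
        print(saved_steps(11, 5, "steps", force_final_save=True))   # [5, 10, 11]

Under the second configuration in the reworked test, the forced final save produces checkpoint-5, checkpoint-10 and checkpoint-11; save_total_limit=2 then prunes checkpoint-5, and load_best_model_at_end reloads checkpoint-10 because, as the test comment notes, best-model tracking follows the last evaluation step rather than the last save. That is why the weight comparison targets checkpoint-10 instead of the final checkpoint-11.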