Rework test
muellerzr committed Apr 29, 2024
1 parent 71be27e commit 7e405f7
Showing 1 changed file with 20 additions and 15 deletions.
35 changes: 20 additions & 15 deletions tests/trainer/test_trainer.py
@@ -34,6 +34,7 @@
from huggingface_hub import HfFolder, ModelCard, delete_repo, list_repo_commits, list_repo_files
from parameterized import parameterized
from requests.exceptions import HTTPError
from safetensors.torch import load_file

from transformers import (
    AutoTokenizer,
@@ -1778,19 +1779,21 @@ def test_load_best_model_with_save(self):
                save_steps=5,
                evaluation_strategy="steps",
                eval_steps=5,
                max_steps=9,
            )
            trainer.train()
            # Check that we have the last known step:
            assert os.path.exists(
                os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}")
            ), f"Could not find checkpoint-{trainer.state.max_steps}"
            # And then check the last multiple
            last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5
            assert os.path.exists(
                os.path.join(tmpdir, f"checkpoint-{last_multiple}")
            ), f"Could not find checkpoint-{last_multiple}"
            # And then check the last step
            assert os.path.exists(os.path.join(tmpdir, "checkpoint-9")), "Could not find checkpoint-9"

        # Now test that using a limit works
        # Should result in:
        # - save at step 5 (but is deleted)
        # - save at step 10 (loaded in at the end when `load_best_model=True`)
        # - save at step 11
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                output_dir=tmpdir,
@@ -1799,21 +1802,23 @@ def test_load_best_model_with_save(self):
                eval_steps=5,
                load_best_model_at_end=True,
                save_total_limit=2,
                max_steps=11,
            )
            trainer.train()
            # Check that we have the last known step:
            assert os.path.exists(
                os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}")
            ), f"Could not find checkpoint-{trainer.state.max_steps}"
            assert os.path.exists(os.path.join(tmpdir, "checkpoint-11")), "Could not find checkpoint-11"
            # And then check the last multiple
            last_multiple = trainer.state.max_steps - trainer.state.max_steps % 5
            assert os.path.exists(
                os.path.join(tmpdir, f"checkpoint-{last_multiple}")
            ), f"Could not find checkpoint-{last_multiple}"
            assert os.path.exists(os.path.join(tmpdir, "checkpoint-10")), "Could not find checkpoint-10"
            # Finally check that we don't have an old one
            assert not os.path.exists(
                os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps-10}")
            ), f"Found checkpoint-{trainer.state.max_steps-10}, limit not respected"
            assert not os.path.exists(os.path.join(tmpdir, "checkpoint-5")), "Found checkpoint-5, limit not respected"

            # Finally check that the right model was loaded in, checkpoint-10
            # this goes by the last `eval` step check to do so, so it won't be
            # the last model *saved*
            model_state = trainer.model.state_dict()
            final_model_weights = load_file(os.path.join(tmpdir, "checkpoint-10", "model.safetensors"))
            for k, v in model_state.items():
                assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"

@require_torch_multi_accelerator
def test_run_seq2seq_double_train_wrap_once(self):
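Why the reworked test expects checkpoint-10, and not checkpoint-11 (the last checkpoint saved), to be the weights loaded back in: with eval_steps=5, the last evaluation runs on the last multiple of 5 before max_steps=11, which is the same arithmetic the previous version of the test computed as last_multiple. The test relies on that last evaluation also being the best one. A minimal sketch of the calculation, using only the numbers from the test (no Trainer involved):

    max_steps = 11
    eval_steps = 5

    # The last step on which an evaluation runs, and therefore the newest
    # checkpoint that `load_best_model_at_end=True` can ever pick.
    last_eval_step = max_steps - max_steps % eval_steps
    print(last_eval_step)  # 10 -> checkpoint-10

    # Step 11 is still saved because it is the final training step, but it is
    # never evaluated, so it cannot become the "best" checkpoint.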

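The weight check added at the end of the test uses only safetensors.torch.load_file and torch.allclose. Below is a minimal, self-contained sketch of the same load-and-compare pattern; TinyModel and the checkpoint-10 directory are made up for illustration and merely mimic the model.safetensors file that Trainer writes into each checkpoint folder:

    import os
    import tempfile

    import torch
    from safetensors.torch import load_file, save_file
    from torch import nn


    class TinyModel(nn.Module):
        # Hypothetical stand-in for the regression model built by the test helper.
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(4, 2)


    with tempfile.TemporaryDirectory() as tmpdir:
        model = TinyModel()
        checkpoint_dir = os.path.join(tmpdir, "checkpoint-10")
        os.makedirs(checkpoint_dir)

        # Write the weights as a safetensors file, under the same file name a checkpoint uses.
        save_file(model.state_dict(), os.path.join(checkpoint_dir, "model.safetensors"))

        # Reload and compare tensor by tensor, mirroring the assertion added in this commit.
        reloaded = load_file(os.path.join(checkpoint_dir, "model.safetensors"))
        for k, v in model.state_dict().items():
            assert torch.allclose(v, reloaded[k]), f"{k} is not the same"
        print("all tensors match")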