From ac51e59e4755a4c7f69458be67737dd92b86b316 Mon Sep 17 00:00:00 2001 From: xkszltl Date: Mon, 5 Feb 2024 18:21:50 -0800 Subject: [PATCH] Do not use mtime for checkpoint rotation. (#28862) Resolve https://github.com/huggingface/transformers/issues/26961 --- src/transformers/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index fa508a35077038..74e484acacde3f 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2465,7 +2465,9 @@ def _save_checkpoint(self, model, trial, metrics=None): # Maybe delete some older checkpoints. if self.args.should_save: - self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) + # Solely rely on numerical checkpoint id for rotation. + # mtime is not reliable especially on some fuse fs in cloud environments. + self._rotate_checkpoints(use_mtime=False, output_dir=run_dir) self.args.distributed_state.wait_for_everyone()