diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 905744a64ed4c6..0b2a88cb803dad 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1734,6 +1734,15 @@ def _inner_training_loop(
                 )
             elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled:
                 self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped)
+
+        # deepspeed ckpt loading
+        if resume_from_checkpoint is not None and self.is_deepspeed_enabled:
+            deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint, load_module_strict=False)
+            if self.args.deepspeed_force_lr_scheduler_checkpointing and self.model_wrapped.lr_scheduler is None:
+                if os.path.isfile(os.path.join(resume_from_checkpoint, SCHEDULER_NAME)):
+                    with warnings.catch_warnings(record=True) as caught_warnings:
+                        self.lr_scheduler.load_state_dict(torch.load(os.path.join(resume_from_checkpoint, SCHEDULER_NAME)))
+                    reissue_pt_warnings(caught_warnings)
 
         # Check if saved optimizer or scheduler states exist
         self._load_optimizer_and_scheduler(resume_from_checkpoint)
@@ -2416,6 +2425,12 @@ def _save_checkpoint(self, model, trial, metrics=None):
         else:
             staging_output_dir = os.path.join(run_dir, f"tmp-{checkpoint_folder}")
         self.save_model(staging_output_dir, _internal_call=True)
+        if self.is_deepspeed_enabled:
+            # under zero3 the model file itself doesn't get saved since it's bogus, unless the deepspeed
+            # config `stage3_gather_16bit_weights_on_model_save` is True
+            self.model_wrapped.save_checkpoint(staging_output_dir)
+            if self.args.deepspeed_force_lr_scheduler_checkpointing and self.model_wrapped.lr_scheduler is None:
+                torch.save(self.lr_scheduler.state_dict(), os.path.join(staging_output_dir, SCHEDULER_NAME))
 
         if not self.args.save_only_model:
             # Save optimizer and scheduler
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 56f102396e0fe5..47aa4d11b472d6 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1316,6 +1316,18 @@ class TrainingArguments:
             "help": "Activates neftune noise embeddings into the model. NEFTune has been proven to drastically improve model performances for instrcution fine-tuning. Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
         },
     )
+
+    deepspeed_force_lr_scheduler_checkpointing: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Force saving and loading of the lr_scheduler state when DeepSpeed is enabled but does not "
+                "support the lr_scheduler type. "
+                "Use this to keep checkpointing the lr_scheduler when its type does not fall into one of "
+                "DeepSpeed's supported lr_scheduler categories."
+            )
+        },
+    )
 
     def __post_init__(self):
         # expand paths, if not os.makedirs("~/bar") will make directory
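
For reviewers, a minimal sketch of how the new flag would be exercised. It is illustrative only: the model name, the toy dataset, and `ds_config.json` (assumed to be a DeepSpeed config whose `scheduler` section is omitted, so DeepSpeed does not manage the scheduler itself) are placeholders, not part of this diff; `deepspeed_force_lr_scheduler_checkpointing` and the extra `scheduler.pt` in each checkpoint are what the patch adds.

```python
# Illustrative sketch only: "gpt2", ToyDataset, and ds_config.json are placeholders,
# not part of this PR. Launch with the deepspeed launcher, e.g.:
#   deepspeed --num_gpus=1 train_sketch.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments


class ToyDataset(torch.utils.data.Dataset):
    """A handful of identical causal-LM examples, just to keep the script self-contained."""

    def __init__(self, tokenizer, n=32):
        enc = tokenizer("hello world", return_tensors="pt")
        self.examples = [
            {
                "input_ids": enc["input_ids"][0],
                "attention_mask": enc["attention_mask"][0],
                "labels": enc["input_ids"][0].clone(),
            }
            for _ in range(n)
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]


tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_steps=5,
    lr_scheduler_type="cosine_with_restarts",  # not one of DeepSpeed's native scheduler types
    deepspeed="ds_config.json",                # assumed config with no `scheduler` block
    deepspeed_force_lr_scheduler_checkpointing=True,  # new flag added by this diff
)

trainer = Trainer(model=model, args=args, train_dataset=ToyDataset(tokenizer))
trainer.train()
# With the flag set (and DeepSpeed not owning the scheduler), each checkpoint folder
# should now also contain scheduler.pt, which is reloaded when resuming:
trainer.train(resume_from_checkpoint=True)
```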