diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 5957f8025d2a0b..c2327739549e5e 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -3700,7 +3700,7 @@ def training_step(
         else:
             # Finally we need to normalize the loss for reporting
             if num_items_in_batch is None:
-                loss /= self.args.gradient_accumulation_steps
+                loss = loss / self.args.gradient_accumulation_steps
 
             self.accelerator.backward(loss, **kwargs)
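
For context, the behavioral difference this patch targets is aliasing: `loss /= n` mutates the tensor object in place, whereas `loss = loss / n` rebinds the name to a freshly allocated tensor and leaves the original object untouched. The sketch below is illustrative only and is not the Trainer's code; `logged` is a hypothetical second reference to the loss tensor, standing in for any other place that holds on to the same object (e.g. for reporting).

```python
import torch

# Hypothetical sketch of the aliasing difference (not Trainer code).
loss = torch.tensor(4.0)
logged = loss             # second reference to the same tensor object
loss /= 2                 # in-place: mutates the shared tensor
print(logged.item())      # 2.0 -- the other reference sees the scaled value

loss = torch.tensor(4.0)
logged = loss
loss = loss / 2           # out-of-place: rebinds `loss` to a new tensor
print(logged.item())      # 4.0 -- the other reference is untouched
```

A related, hedged note: PyTorch autograd also rejects in-place ops on leaf tensors that require grad (`RuntimeError: a leaf Variable that requires grad is being used in an in-place operation`), so the out-of-place form is the safer default even beyond the aliasing concern shown above.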