Clean up fit_start & callback rank 0 only
joyce-chen-uni committed Aug 26, 2024
1 parent 5b044fa commit b521b94
Showing 1 changed file with 7 additions and 12 deletions.
19 changes: 7 additions & 12 deletions llmfoundry/callbacks/kill_loss_spike_callback.py
@@ -11,6 +11,7 @@
 import torch
 from composer.core import Callback, State, TimeUnit
 from composer.loggers import Logger, MosaicMLLogger
+from composer.utils import dist
 
 from llmfoundry.utils.exceptions import HighLossError, LossSpikeError
 from llmfoundry.utils.warnings import experimental_class
@@ -56,7 +57,7 @@ def __init__(
         patience: int = 4,
         outlier_multiplier: float = 2,
     ):
-        # self._enabled = (dist.get_global_rank() == 0)
+        self._enabled = True
         self.log_only = log_only
         self.patience = patience
         self.outlier_multiplier = outlier_multiplier
@@ -139,21 +140,15 @@ def fit_start(self, state: State, logger: Logger) -> None:
         # Set the window size to a fraction of the total number of training batches for the run, minimum 100 batches.
         if state.max_duration is not None:
             if state.max_duration.unit == TimeUnit.EPOCH and state.dataloader_len is not None:
-                self.window_size = max(
-                    self.window_size,
-                    round(
-                        float(
-                            state.dataloader_len * state.max_duration.value *
-                            WINDOW_FRACTION,
-                        ),
-                    ),
-                )
+                total_training_steps = state.dataloader_len * state.max_duration.value
             elif state.max_duration.unit == TimeUnit.BATCH or state.max_duration.unit == TimeUnit.TOKEN:
-                self.window_size = max(
+                total_training_steps = state.max_duration.value
+            self.window_size = max(
                 self.window_size,
-                round(float(state.max_duration.value * WINDOW_FRACTION)),
+                round(float(total_training_steps * WINDOW_FRACTION)),
             )
         self.loss_window = deque(maxlen=self.window_size)
+        self._enabled = (dist.get_global_rank() == 0)
 
     def batch_end(self, state: State, logger: Logger) -> None:
 
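For context, the net effect of the `_enabled` changes above is a common Composer pattern: construct the callback on every rank, but defer the rank check to `fit_start` (presumably because the distributed environment is not guaranteed to be initialized when `__init__` runs) and let only global rank 0 do the work afterwards. A minimal sketch of that pattern, using a hypothetical `MyRankZeroCallback` rather than the real `KillLossSpike` class:

```python
from composer.core import Callback, State
from composer.loggers import Logger
from composer.utils import dist


class MyRankZeroCallback(Callback):
    """Hypothetical callback illustrating the rank-0-only gating used in this commit."""

    def __init__(self) -> None:
        super().__init__()
        # Enabled by default; dist may not be initialized yet, so defer the rank check.
        self._enabled = True

    def fit_start(self, state: State, logger: Logger) -> None:
        # By fit_start the process group exists, so gate on global rank 0 only.
        self._enabled = (dist.get_global_rank() == 0)

    def batch_end(self, state: State, logger: Logger) -> None:
        if not self._enabled:
            return
        # ... rank-0-only work (e.g. loss tracking / logging) would go here ...
```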

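The consolidated window-size computation in `fit_start` reads as: derive a total step count from `max_duration` (converting epochs to batches via `dataloader_len`, or using the duration value directly for batch/token units), then take a fixed fraction of it, never shrinking below the existing minimum. A standalone sketch of that arithmetic; `WINDOW_FRACTION` and `MIN_WINDOW_SIZE` stand in for the module-level constants defined elsewhere in the file, and the values below are illustrative only:

```python
# Placeholder values; the real constants live at module level in
# kill_loss_spike_callback.py and may differ (the diff's comment only
# states a minimum of 100 batches).
WINDOW_FRACTION = 0.05
MIN_WINDOW_SIZE = 100


def compute_window_size(total_training_steps: float) -> int:
    """Window is a fixed fraction of the run length, floored at MIN_WINDOW_SIZE."""
    return max(MIN_WINDOW_SIZE, round(float(total_training_steps * WINDOW_FRACTION)))


# A 10,000-batch run: max(100, round(10_000 * 0.05)) -> 500-batch loss window.
print(compute_window_size(10_000))  # 500
# A short 1,000-batch run falls back to the minimum: max(100, 50) -> 100.
print(compute_window_size(1_000))   # 100
```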