diff --git a/llmfoundry/callbacks/kill_loss_spike_callback.py b/llmfoundry/callbacks/kill_loss_spike_callback.py
index 33786fc0de..20a08b9faf 100644
--- a/llmfoundry/callbacks/kill_loss_spike_callback.py
+++ b/llmfoundry/callbacks/kill_loss_spike_callback.py
@@ -59,9 +59,9 @@ def __init__(
         self.patience = patience
         self.outlier_multiplier = outlier_multiplier
         self.outlier_counter = 0
-        self.window_size = None
-        self.loss_window = None
-        self.loss_cap = None
+        self.window_size = MIN_WINDOW_SIZE
+        self.loss_window = deque(maxlen=self.window_size)
+        self.loss_cap = float('inf')
 
     def detect_loss_spike(self, train_loss: float, running_loss_avg: float):
         # Train loss is an outlier
@@ -95,16 +95,17 @@ def detect_high_losses(self, current_step: int):
 
     def fit_start(self, state: State, logger: Logger) -> None:
         # Set the window size to a fraction of the total number of training batches for the run, minimum 100 batches.
-        if state.max_duration.unit == TimeUnit.EPOCH:
-            self.window_size = max(
-                MIN_WINDOW_SIZE,
-                round(float(state.dataloader_len * state.max_duration.value / 20)),
-            )
-        elif state.max_duration.unit == TimeUnit.BATCH or state.max_duration.unit == TimeUnit.TOKEN:
-            self.window_size = max(
-                MIN_WINDOW_SIZE,
-                round(float(state.max_duration.value / 20)),
-            )
+        if state.max_duration is not None:
+            if state.max_duration.unit == TimeUnit.EPOCH:
+                self.window_size = max(
+                    MIN_WINDOW_SIZE,
+                    round(float(state.dataloader_len * state.max_duration.value / 20)),
+                )
+            elif state.max_duration.unit == TimeUnit.BATCH or state.max_duration.unit == TimeUnit.TOKEN:
+                self.window_size = max(
+                    MIN_WINDOW_SIZE,
+                    round(float(state.max_duration.value / 20)),
+                )
         self.loss_window = deque(maxlen=self.window_size)
 
     def batch_end(self, state: State, logger: Logger) -> None:
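
Illustrative sketch (not part of the diff): the change above gives the callback safe defaults (MIN_WINDOW_SIZE, an empty deque, an infinite loss cap) and only recomputes the window size in fit_start when state.max_duration is set. The standalone helper below is hypothetical, compute_window_size is not a function in the callback, and the MIN_WINDOW_SIZE value of 100 is assumed from the "minimum 100 batches" comment; it only mirrors the branching the patch introduces.

# Hypothetical sketch of the window-size fallback behavior; names here are illustrative,
# the real logic lives in KillLossSpike.fit_start.
from collections import deque
from typing import Optional

MIN_WINDOW_SIZE = 100  # assumed default, per the "minimum 100 batches" comment


def compute_window_size(
    unit: Optional[str],                  # 'epoch', 'batch', or 'token'; None if max_duration is unset
    duration_value: Optional[float],
    dataloader_len: Optional[float] = None,
) -> int:
    # With no max_duration, keep the constructor default of MIN_WINDOW_SIZE.
    if unit is None or duration_value is None:
        return MIN_WINDOW_SIZE
    if unit == 'epoch' and dataloader_len is not None:
        total_batches = dataloader_len * duration_value
    else:  # 'batch' or 'token'
        total_batches = duration_value
    # The window is roughly 1/20 of the run, but never smaller than MIN_WINDOW_SIZE.
    return max(MIN_WINDOW_SIZE, round(float(total_batches) / 20))


# Usage: a 4000-batch run yields a 200-batch window; an unset duration falls back to 100.
loss_window = deque(maxlen=compute_window_size('batch', 4000))    # maxlen == 200
fallback_window = deque(maxlen=compute_window_size(None, None))   # maxlen == 100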