Fix grad accum when using loss_mask #842

Open
wants to merge 5 commits into main
Changes from 2 commits
7 changes: 7 additions & 0 deletions src/levanter/grad_accum.py
@@ -1,3 +1,4 @@
import abc
import enum
import functools
from typing import Callable, Optional, ParamSpec, TypeVar
@@ -20,6 +21,12 @@
R = TypeVar("R")


class NumElementsBatch(abc.ABC):
@abc.abstractmethod
def num_elements(self) -> int:
pass


class ReductionType(enum.Enum):
SUM = enum.auto()
MEAN = enum.auto()
8 changes: 7 additions & 1 deletion src/levanter/models/lm_model.py
@@ -11,6 +11,7 @@
import haliax as hax
from haliax import Axis, NamedArray, NamedOrNumeric

from levanter.grad_accum import NumElementsBatch
from levanter.models.attention import AttentionMask
from levanter.models.loss import maybe_fused_next_token_loss

@@ -19,7 +20,7 @@
LmT = TypeVar("LmT", bound="LmHeadModel")


class LmExample(eqx.Module):
class LmExample(eqx.Module, NumElementsBatch):
tokens: hax.NamedArray
loss_mask: hax.NamedArray
attn_mask: AttentionMask | NamedArray = AttentionMask.causal()
@@ -88,6 +89,9 @@ def from_prompt_and_completion(

return LmExample(tokens=tokens, loss_mask=loss_mask, attn_mask=attn_mask)

def num_elements(self):
return self.loss_mask.sum()


# TODO: for some reason, mypy doesn't like the discover_packages_path argument?
@dataclass(frozen=True)
@@ -221,6 +225,7 @@ def compute_next_token_loss(
key=None,
reduction: Optional[hax.ReductionFunction] = hax.mean,
reduction_axis: Optional[hax.AxisSelection] = None,
batch_num_elements: Optional[int] = None,
logsumexp_weight: Optional[float] = None,
loss_dtype: Optional[Type[jnp.dtype]] = jnp.float32,
) -> jnp.ndarray | NamedArray:
@@ -241,6 +246,7 @@
loss_mask=example.loss_mask,
reduction=reduction,
reduction_axis=reduction_axis,
batch_num_elements=batch_num_elements,
logsumexp_weight=logsumexp_weight,
dtype=loss_dtype,
block_size=model.config.cross_entropy_block_size,
61 changes: 37 additions & 24 deletions src/levanter/models/loss.py
@@ -1,4 +1,5 @@
import functools
import logging
from typing import Optional

import equinox
@@ -10,6 +11,9 @@
from haliax.nn import cross_entropy_loss_and_log_normalizers


logger = logging.getLogger(__name__)


def maybe_fused_next_token_loss(
Pos: hax.AxisSelector,
Embed: hax.AxisSelector,
Expand All @@ -20,6 +24,7 @@ def maybe_fused_next_token_loss(
loss_mask: Optional[NamedArray] = None,
reduction: Optional[hax.ReductionFunction] = hax.mean,
reduction_axis: Optional[hax.AxisSelection] = None,
batch_num_elements: Optional[int] = None,
dlwh (Member) commented:

imho i think the thing to do is to use reduction=None inside grad accum and do the reduction separately. Also, can't you infer batch_num_elements from loss_mask.sum?

Aphoh (Contributor Author) replied:

@dlwh We can't get that from loss_mask.sum, since the loss_mask we get is for the microbatch and not the whole batch. Instead of computing a mean loss per microbatch and averaging those mean losses, we need to sum the losses over all microbatches and divide by the true number of losses in the batch.
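To illustrate the reply above, here is a small self-contained sketch (plain Python, with made-up loss values) of why averaging per-microbatch means differs from the true batch mean when the loss mask leaves different numbers of tokens in each microbatch:

```python
# Hypothetical numbers, for illustration only: two microbatches with different
# numbers of unmasked (loss-contributing) tokens.
mb1_losses = [2.0, 2.0, 2.0]  # 3 unmasked tokens
mb2_losses = [4.0]            # 1 unmasked token

# Averaging per-microbatch means weights the lone token in mb2 too heavily.
mean_of_means = (sum(mb1_losses) / len(mb1_losses) + sum(mb2_losses) / len(mb2_losses)) / 2

# Summing every loss and dividing by the total count (batch_num_elements) gives the true mean.
batch_num_elements = len(mb1_losses) + len(mb2_losses)
true_batch_mean = (sum(mb1_losses) + sum(mb2_losses)) / batch_num_elements

print(mean_of_means)    # 3.0
print(true_batch_mean)  # 2.5
```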

logsumexp_weight: Optional[float] = None,
block_size: Optional[int] = None,
dtype: Optional[jnp.dtype] = jnp.float32,
@@ -36,6 +41,7 @@
loss_mask (Optional[NamedArray]): Mask to apply to the loss.
reduction (Optional[hax.ReductionFunction]): Reduction function.
reduction_axis (Optional[hax.AxisSelection]): Axis to apply reduction.
batch_num_elements (Optional[int]): The total number of loss elements in the full batch. When passed, the loss is divided by this value rather than averaged over the microbatch.
logsumexp_weight (Optional[float]): Weight for logsumexp penalty.
block_size (Optional[int]): Size of each block for processing.

@@ -45,6 +51,9 @@
# Resolve axes
Pos = pred_embeddings.resolve_axis(Pos)
Vocab = pred_lm_head.resolve_axis(Vocab)
if batch_num_elements is not None:
if reduction is not hax.sum:
logger.warning("batch_num_elements given when reduction is not hax.sum, make sure this is intended")

if block_size is None:
# Full softmax computation
@@ -53,32 +62,36 @@
logits = logits.astype(dtype)

# Shift target tokens to predict the next token
return next_token_loss(Pos, Vocab, logits, true_ids, loss_mask, reduction, reduction_axis, logsumexp_weight)

# Shift target tokens to predict the next token
target_y = hax.roll(true_ids, -1, Pos)

# Create a mask that excludes the last token
not_last_loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jnp.float32) # type: ignore
if loss_mask is not None:
loss_mask = loss_mask * not_last_loss_mask
loss = next_token_loss(Pos, Vocab, logits, true_ids, loss_mask, reduction, reduction_axis, logsumexp_weight)
else:
loss_mask = not_last_loss_mask
# Shift target tokens to predict the next token
target_y = hax.roll(true_ids, -1, Pos)

# Compute the loss with optional block-wise processing
return fused_cross_entropy_loss_and_logsumexp_penalty(
pred_embeddings,
pred_lm_head,
Contract=Embed,
Label=Vocab,
target_y=target_y,
reduction=reduction,
reduction_axis=reduction_axis,
where=loss_mask,
logsumexp_weight=logsumexp_weight,
block_size=block_size,
dtype=dtype,
)
# Create a mask that excludes the last token
not_last_loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jnp.float32) # type: ignore
if loss_mask is not None:
loss_mask = loss_mask * not_last_loss_mask
else:
loss_mask = not_last_loss_mask

# Compute the loss with optional block-wise processing
loss = fused_cross_entropy_loss_and_logsumexp_penalty(
pred_embeddings,
pred_lm_head,
Contract=Embed,
Label=Vocab,
target_y=target_y,
reduction=reduction,
reduction_axis=reduction_axis,
where=loss_mask,
logsumexp_weight=logsumexp_weight,
block_size=block_size,
dtype=dtype,
)

if batch_num_elements is not None:
return loss / batch_num_elements
return loss


def next_token_loss(
20 changes: 15 additions & 5 deletions src/levanter/trainer.py
@@ -37,7 +37,7 @@
from levanter.config import JsonAtom
from levanter.data import AsyncDataset, DataLoader
from levanter.distributed import DistributedConfig, RayConfig
from levanter.grad_accum import microbatched
from levanter.grad_accum import NumElementsBatch, ReductionType, microbatched
from levanter.tracker import TrackerConfig, capture_time
from levanter.trainer_state import TrainerState, saveable_training_mask
from levanter.utils import cloud_utils, fsspec_utils
@@ -380,7 +380,7 @@ def checkpoint_path(self) -> str:
checkpoint_path = self.config.checkpointer.expanded_path(self.run_id)
return checkpoint_path

def train_step(self, state: S, *batch: X, **batch_kwargs) -> StepInfo[S]:
def train_step(self, state: S, batch: X, **batch_kwargs) -> StepInfo[S]:
"""
Performs a single training step.
"""
@@ -529,7 +529,7 @@ def _train_step(
key, new_key = jax.random.split(state.training_key)
model = inference_mode(state.model, False)

loss, grads = self._compute_gradients_microbatched(self.loss_fn, model, *batch, **batch_kwargs, key=key)
loss, grads = self._compute_gradients_microbatched(self.loss_fn, model, batch, **batch_kwargs, key=key)

with hax.axis_mapping(self.parameter_axis_mapping):
if not _no_hooks:
@@ -549,18 +549,28 @@ def obj_fun(trainable_model):
else:
return loss, new_state, hook_infos

def _compute_gradients_microbatched(self, loss_fn, model: M, *batch, **batch_kwargs) -> tuple[Scalar, M]:
def _compute_gradients_microbatched(self, loss_fn, model: M, batch: X, **batch_kwargs) -> tuple[Scalar, M]:
grad_fn = eqx.filter_value_and_grad(loss_fn, has_aux=False)
mbs = self.config.microbatch_size
reduce = ReductionType.MEAN
if isinstance(batch, NumElementsBatch) and mbs != self.TrainBatch.size:
batch_kwargs[
"batch_num_elements"
] = batch.num_elements() # tell the loss function how many elements are in the batch
batch_kwargs[
"reduction"
] = hax.sum # the loss fn should sum the loss and divide by the number of elements, not average
reduce = ReductionType.SUM # we're already normalizing the loss
grad_fn = microbatched(
dlwh (Member) commented:

let's have the grad_fn return both the total (masked) loss and the number of elements (and the gradient?). WDYT?

Aphoh (Contributor Author) replied:

So we'd change the signature of ComputeLossFunction to return something like (model, input) -> (loss, extras), where extras has the number of elements in the microbatch in it?

I also find myself wanting to log things this way (by returning elements in an extras struct), so this would be a nice way to add that in.

The only concern is that if the number of microbatch elements or the losses are particularly large, we could lose precision in the value of the loss, right? When computing (loss1 + loss2 + ...) / num_elems vs. loss1/num_elems + loss2/num_elems + ..., I would kinda expect the second to be more accurate. Is this a reasonable concern or am I prematurely optimizing?

Aphoh (Contributor Author) commented on Dec 13, 2024:

I think this might be a valid concern, given we'd be summing up fp16 gradients. This is what happens if you sum up a bunch of values near ln(vocab_size), which is around the maximum value you'd get from the loss:

[image: plot of accumulation error when summing values near ln(vocab_size) in fp16]
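To make the precision concern concrete, here is a quick NumPy sketch (illustrative only, not part of the PR) comparing an fp16 running sum against an fp16 running mean for values near ln(vocab_size) with a ~50k vocabulary:

```python
import numpy as np

# Accumulate ~10.8 (≈ ln(50_000), roughly the maximum per-token loss for a ~50k
# vocab) 4096 times, keeping the accumulators in float16 the whole way.
vals = np.full(4096, np.log(50_000.0), dtype=np.float16)

running_sum = np.float16(0.0)
running_mean = np.float16(0.0)
for i, v in enumerate(vals, start=1):
    running_sum = np.float16(running_sum + v)  # large sum -> coarse fp16 rounding steps
    running_mean = np.float16(running_mean + (v - running_mean) / np.float16(i))  # stays ~10.8

print(float(running_sum) / len(vals))  # ends up far below 10.8 once the sum grows large
print(float(running_mean))             # ≈ 10.8
```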

dlwh (Member) replied:

there's actually a standard way of dealing with this, which I've encapsulated as RunningMean. The basic idea is to maintain the mean and the count rather than the sum and the count. This solves most precision problems usually. (You can actually do variance too, but we don't need it.)
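For reference, a minimal sketch of the running-mean idea described above (this is not Levanter's actual RunningMean class, just the update rule it alludes to):

```python
class RunningMeanSketch:
    """Keeps (mean, count) instead of (sum, count), so the accumulator stays at the
    magnitude of a single loss value and avoids large-sum rounding error."""

    def __init__(self) -> None:
        self.mean = 0.0
        self.count = 0

    def update(self, other_mean: float, other_count: int) -> None:
        total = self.count + other_count
        if total == 0:
            return
        # Weighted combination of the two means; no large intermediate sum is ever formed.
        self.mean += (other_mean - self.mean) * (other_count / total)
        self.count = total
```

Each microbatch would contribute its masked mean loss and its element count, and the combined mean equals the true batch mean without ever forming the large sum.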

Aphoh (Contributor Author) replied:

@dlwh I tried to encapsulate this in this last commit. I'm pretty sure it doesn't work right, since it doesn't pass the grad_accum tests (trees don't match). Could you give it a quick peek and let me know if you see anything?

grad_fn,
self.TrainBatch,
mbs,
self.parameter_axis_mapping,
self.compute_axis_mapping,
reduce=reduce,
)
with hax.axis_mapping(self.compute_axis_mapping):
return grad_fn(model, *batch, **batch_kwargs)
return grad_fn(model, batch, **batch_kwargs)
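Putting the pieces together, here is a simplified sketch (hypothetical helper names, not Levanter's API) of the accumulation scheme this change sets up in `_compute_gradients_microbatched`: each microbatch loss is summed and pre-divided by the full batch's element count, so summing across microbatches yields the true batch mean.

```python
# `masked_losses(mb)` is a stand-in for the real loss function: it returns the list
# of unmasked per-token losses for one microbatch.
def accumulated_batch_loss(microbatches, masked_losses):
    # Computed once from the whole batch's loss mask, as NumElementsBatch.num_elements() provides.
    batch_num_elements = sum(len(masked_losses(mb)) for mb in microbatches)

    total = 0.0
    for mb in microbatches:
        # reduction=hax.sum + batch_num_elements: each microbatch contributes
        # sum(its masked losses) / batch_num_elements ...
        total += sum(masked_losses(mb)) / batch_num_elements
    # ... and ReductionType.SUM in `microbatched` just adds the contributions,
    # giving (sum of all masked losses) / batch_num_elements, i.e. the true batch mean.
    return total
```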


def _initialize_global_tracker(config, run_id):
3 changes: 2 additions & 1 deletion src/levanter/utils/types.py
@@ -51,9 +51,10 @@ class ComputeLossFunction(Protocol[M_con, X]):
def __call__(
self,
model: M_con,
*inputs: X,
input: X,
reduction: Optional[hax.ReductionFunction] = hax.mean,
reduction_axis: Optional[hax.AxisSelection] = None,
batch_num_elements: Optional[int] = None,
**kwargs,
) -> Scalar | hax.NamedArray:
...