linkedin · wheynelau · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024
diff --git a/src/liger_kernel/ops/fused_linear_jsd.py b/src/liger_kernel/ops/fused_linear_jsd.py
@@ -27,6 +27,7 @@ def fused_linear_jsd_forward(
     ignore_index,
     has_label,
     temperature,
+    softcap=0.0,
 ):
     device = student_input.device
     dtype = student_input.dtype
@@ -79,6 +80,14 @@ def fused_linear_jsd_forward(
         teacher_logits_chunk = (teacher_input_chunk @ teacher_weight.t()).to(
             torch.float32
         )
+
+        if softcap > 0.0:
+            # keep tanh(x / softcap): its derivative (1 - tanh^2) rescales the student grad below
+            intermediate_student = torch.tanh(student_logits_chunk / softcap)
+            student_logits_chunk = intermediate_student * softcap
+
+            teacher_logits_chunk = torch.tanh(teacher_logits_chunk / softcap) * softcap
+
         chunk_n_rows = student_logits_chunk.shape[0]
 
         # unreduced loss
@@ -125,6 +134,8 @@ def fused_linear_jsd_forward(
                 student_prob_chunk.shape
             )
         ) / temperature
+        if softcap > 0.0:
+            student_logits_chunk *= 1 - intermediate_student**2
         # now we traverse back to grad w.r.t. input to `lm_head` and grad
         # w.r.t. `lm_head` which should be computed in original dtype
         student_logits_chunk = student_logits_chunk.to(dtype)
@@ -193,6 +204,7 @@ def forward(
         jsd_beta: float = 0.5,
         ignore_index: int = -100,
         temperature: float = 1.0,
+        softcap: float = 0.0,
     ):
         """
         Args:
@@ -227,6 +239,7 @@ def forward(
             ignore_index,
             has_label,
             temperature,
+            softcap,
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
@@ -242,4 +255,4 @@ def backward(ctx, grad_output):
         grad_input, grad_weight = fused_linear_jsd_backward(
             grad_output, grad_input, grad_weight
         )
-        return (grad_input, grad_weight, None, None, None, None, None, None)
+        return (grad_input, grad_weight, None, None, None, None, None, None, None)
diff --git a/src/liger_kernel/transformers/functional.py b/src/liger_kernel/transformers/functional.py
@@ -79,6 +79,7 @@ def liger_fused_linear_jsd(
     jsd_beta: float = 0.5,
     ignore_index: int = -100,
     temperature: float = 1.0,
+    softcap: float = 0.0,  # 0.0 disables soft-capping; matches the Function/Module defaults
 ):
     return LigerFusedLinearJSDFunction.apply(
         student_input,
@@ -89,6 +90,7 @@ def liger_fused_linear_jsd(
         jsd_beta,
         ignore_index,
         temperature,
+        softcap,
     )
 
 

diff --git a/src/liger_kernel/transformers/fused_linear_jsd.py b/src/liger_kernel/transformers/fused_linear_jsd.py
@@ -68,12 +68,13 @@ class LigerFusedLinearJSD(torch.nn.Module):
     ```
     """
 
-    def __init__(self, jsd_beta=0.5, ignore_index=-100, temperature=1.0):
+    def __init__(self, jsd_beta=0.5, ignore_index=-100, temperature=1.0, softcap=0.0):
         super().__init__()
         assert temperature != 0, "temperature cannot be 0."
         self.jsd_beta = jsd_beta
         self.temperature = temperature
         self.ignore_index = ignore_index
+        self.softcap = softcap
 
     def forward(
         self,
@@ -92,4 +93,5 @@ def forward(
             self.jsd_beta,
             self.ignore_index,
             self.temperature,
+            self.softcap,
         )
diff --git a/test/transformers/test_fused_linear_jsd.py b/test/transformers/test_fused_linear_jsd.py
@@ -29,6 +29,7 @@ def __init__(
         beta: float = 0.5,
         ignore_index: int = -100,
         temperature: float = 1.0,
+        softcap: float = 0.0,
     ):
         super().__init__()
         self.student_lin = torch.nn.Linear(
@@ -39,15 +40,26 @@ def __init__(
         )
         self.jsd = TorchJSD(beta=beta, ignore_index=ignore_index, dtype=dtype)
         self.temperature = temperature
+        self.softcap = softcap
 
     def forward(self, student_input, teacher_input, label=None):
         student_logits = self.student_lin(student_input).to(torch.float32)
         teacher_logits = self.teacher_lin(teacher_input).to(torch.float32)
+
+        if self.softcap > 0:
+            student_logits = self._softcap(student_logits)
+            teacher_logits = self._softcap(teacher_logits)
+
         student_prob = torch.log_softmax(student_logits / self.temperature, dim=-1)
         teacher_prob = torch.log_softmax(teacher_logits / self.temperature, dim=-1)
 
         return self.jsd(student_prob, teacher_prob, label)
 
+    def _softcap(self, logits):
+        # Reference soft-cap: scale down by softcap, squash with tanh, scale back up.
+        squashed = torch.tanh(logits / self.softcap)
+        return squashed * self.softcap
+
 
 class LigerLMHeadJSD(torch.nn.Module):
     def __init__(
@@ -59,6 +71,7 @@ def __init__(
         beta: float = 0.5,
         ignore_index: int = -100,
         temperature: float = 1.0,
+        softcap: float = 0.0,
     ):
         super().__init__()
         self.student_lin = torch.nn.Linear(
@@ -68,7 +81,10 @@ def __init__(
             in_features=H, out_features=V, bias=False, dtype=dtype, device=device
         )
         self.fused_jsd = LigerFusedLinearJSD(
-            jsd_beta=beta, ignore_index=ignore_index, temperature=temperature
+            jsd_beta=beta,
+            ignore_index=ignore_index,
+            temperature=temperature,
+            softcap=softcap,
         )
 
     def forward(self, student_input, teacher_input, label=None):
@@ -109,7 +125,11 @@ def forward(self, student_input, teacher_input, label=None):
         (1.0, 1.0),  # RKL
     ],
 )
-def test_correctness(B, T, H, V, scalar, dtype, beta, temperature, atol, rtol):
+@pytest.mark.parametrize(
+    "softcap",
+    [0.0, 30.0, 50.0],
+)
+def test_correctness(B, T, H, V, scalar, dtype, beta, temperature, softcap, atol, rtol):
     device = "cuda"
     torch_lm_head_jsd = TorchLMHeadJSD(
         H=H,
@@ -118,6 +138,7 @@ def test_correctness(B, T, H, V, scalar, dtype, beta, temperature, atol, rtol):
         device=device,
         temperature=temperature,
         beta=beta,
+        softcap=softcap,
     ).to(device)
     liger_lm_head_jsd = LigerLMHeadJSD(
         H=H,
@@ -126,6 +147,7 @@ def test_correctness(B, T, H, V, scalar, dtype, beta, temperature, atol, rtol):
         device=device,
         temperature=temperature,
         beta=beta,
+        softcap=softcap,
     ).to(device)
 
     # init the linear in all FusedLinearJSDs with the same weights
@@ -184,8 +206,12 @@ def test_correctness(B, T, H, V, scalar, dtype, beta, temperature, atol, rtol):
         (1.0, 1.0, 2),
     ],
 )
+@pytest.mark.parametrize(
+    "softcap",
+    [0.0, 30.0, 50.0],
+)
 def test_correctness_with_ignore_index(
-    B, T, H, V, scalar, dtype, beta, ignore_index, temperature, atol, rtol
+    B, T, H, V, scalar, dtype, beta, ignore_index, temperature, softcap, atol, rtol
 ):
     device = "cuda"
     torch_lm_head_jsd = TorchLMHeadJSD(
@@ -196,6 +222,7 @@ def test_correctness_with_ignore_index(
         temperature=temperature,
         ignore_index=ignore_index,
         beta=beta,
+        softcap=softcap,
     ).to(device)
     liger_lm_head_jsd = LigerLMHeadJSD(
         H=H,
@@ -205,6 +232,7 @@ def test_correctness_with_ignore_index(
         temperature=temperature,
         ignore_index=ignore_index,
         beta=beta,
+        softcap=softcap,
     ).to(device)
 
     # init the linear in all FusedLinearJSDs with the same weights
@@ -268,8 +296,12 @@ def test_correctness_with_ignore_index(
 @pytest.mark.parametrize(
     "temperature, beta, ignore_index", [(1.0, 0.5, -100), (2.0, 0.1, 42)]
 )
+@pytest.mark.parametrize(
+    "softcap",
+    [0.0, 30.0, 50.0],
+)
 def test_correctness_functional(
-    B, T, H, V, scalar, dtype, beta, ignore_index, temperature, atol, rtol
+    B, T, H, V, scalar, dtype, beta, ignore_index, temperature, softcap, atol, rtol
 ):
     device = "cuda"
 
@@ -304,6 +336,7 @@ def test_correctness_functional(
         jsd_beta=beta,
         ignore_index=ignore_index,
         temperature=temperature,
+        softcap=softcap,
     )
     output2 = LigerFusedLinearJSDFunction.apply(
         _input2,
@@ -314,6 +347,7 @@ def test_correctness_functional(
         beta,
         ignore_index,
         temperature,
+        softcap,
     )
 
     assert_verbose_allclose(output1, output2, atol=atol, rtol=rtol)
@@ -347,8 +381,12 @@ def test_correctness_functional(
         (2.0, 0.1, 42),
     ],
 )
+@pytest.mark.parametrize(
+    "softcap",
+    [0.0, 30.0, 50.0],
+)
 def test_correctness_all_ignored(
-    B, T, H, V, scalar, dtype, beta, ignore_index, temperature, atol, rtol
+    B, T, H, V, scalar, dtype, beta, ignore_index, temperature, softcap, atol, rtol
 ):
     device = "cuda"
     torch_lm_head_jsd = TorchLMHeadJSD(
@@ -359,6 +397,7 @@ def test_correctness_all_ignored(
         temperature=temperature,
         ignore_index=ignore_index,
         beta=beta,
+        softcap=softcap,
     ).to(device)
     liger_lm_head_jsd = LigerLMHeadJSD(
         H=H,
@@ -368,6 +407,7 @@ def test_correctness_all_ignored(
         temperature=temperature,
         ignore_index=ignore_index,
         beta=beta,
+        softcap=softcap,
     ).to(device)
 
     # init the linear in all FusedLinearJSDs with the same weights