fix bug in tl.store mask for kernel _to_fp8_row_major_t_and_non_t
ghstack-source-id: f95151dc71feb9c6af2788a3e2ccc65d08f4bff3
ghstack-comment-id: 2575984684
Pull Request resolved: #1516
danielvegamyhre committed Jan 8, 2025
1 parent 953e1a6 commit 832bb97
Showing 2 changed files with 5 additions and 3 deletions.
@@ -375,8 +375,8 @@ def _to_fp8_row_major_t_and_non_t(
         block_col_offs[:, None] * row_major_t_out_stride_row
         + block_row_offs[None, :] * row_major_t_out_stride_col
     )
-    mask = (block_row_offs[:, None] < row_major_t_num_rows) & (
-        block_col_offs[None, :] < row_major_t_num_cols
+    mask = (block_col_offs[:, None] < row_major_t_num_rows) & (
+        block_row_offs[None, :] < row_major_t_num_cols
     )
     tl.store(row_major_t_out_ptr + row_major_t_offs, fp8_vals.trans(1, 0), mask=mask)

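Why the swapped mask is the fix: the store offsets index the transposed output using block_col_offs as the row index and block_row_offs as the column index, so the bounds check must be built the same way. The old mask compared each offset against the wrong dimension, which only matters when a tile overhangs the edge of a non-square output. Below is a minimal NumPy sketch of such a masked transposed store, illustrating the fixed mask; BLOCK, num_rows, and num_cols are assumed names for this illustration, not taken from the Triton kernel:

```python
# Minimal NumPy sketch of a masked transposed tile store -- an illustration
# of the mask fix, not the actual Triton kernel.
import numpy as np

num_rows, num_cols = 5, 3      # input shape; transposed output is (3, 5)
BLOCK = 4                      # tile size; tiles overhang the tensor edges

x = np.arange(num_rows * num_cols, dtype=np.float32).reshape(num_rows, num_cols)
out_t = np.zeros((num_cols, num_rows), dtype=np.float32)

for r0 in range(0, num_rows, BLOCK):
    for c0 in range(0, num_cols, BLOCK):
        block_row_offs = r0 + np.arange(BLOCK)   # row offsets into the input
        block_col_offs = c0 + np.arange(BLOCK)   # col offsets into the input

        # The store offsets use block_col_offs for the output row and
        # block_row_offs for the output column, so the mask must do the
        # same -- this is exactly the swap the commit makes:
        mask = (block_col_offs[:, None] < num_cols) & (
            block_row_offs[None, :] < num_rows
        )
        for i in range(BLOCK):        # output row  <- block_col_offs[i]
            for j in range(BLOCK):    # output col  <- block_row_offs[j]
                if mask[i, j]:
                    out_t[block_col_offs[i], block_row_offs[j]] = x[
                        block_row_offs[j], block_col_offs[i]
                    ]

# With the fixed mask the transposed copy is exact. With the old ordering
# (block_row_offs[:, None] < num_cols, block_col_offs[None, :] < num_rows),
# in-bounds elements are dropped and out-of-bounds offsets are stored
# whenever the overhanging tile sits on a non-square tensor.
assert np.array_equal(out_t, x.T)
```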
torchao/prototype/float8nocompile/test/train_test.py (3 additions, 1 deletion)
@@ -36,7 +36,9 @@ def model2():
     return TestModel()


-@pytest.mark.parametrize("input_shape", [(16, 32), (1, 16, 32), (2, 16, 32)])
+@pytest.mark.parametrize(
+    "input_shape", [(16, 32), (1, 16, 32), (2, 16, 32), (128, 8192, 32)]
+)
 def test_model_weights_and_gradients(model1, model2, input_shape: tuple[int, int]):
     assert torch.cuda.is_available()
     device = torch.device("cuda")
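The new (128, 8192, 32) case adds a large, highly non-square input, presumably the regime where the old and fixed masks diverge; the three original shapes never caught the bug. A hedged sketch for running the updated test locally, assuming the torchao repository root as the working directory and an available CUDA device (the test itself asserts torch.cuda.is_available()):

```python
# Run the updated test file through pytest's Python entry point.
import sys

import pytest

sys.exit(
    pytest.main(
        ["torchao/prototype/float8nocompile/test/train_test.py", "-v"]
    )
)
```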
