coreylowman · coreylowman · Nov 6, 2023 · Oct 26, 2023 · Oct 26, 2023
diff --git a/src/cublaslt/result.rs b/src/cublaslt/result.rs
@@ -61,6 +61,18 @@
     }
 }
 
+/// Writes a single attribute onto an existing cuBLASLt matrix layout descriptor.
+///
+/// `attr` selects which attribute is written; `buf` and `buf_size` are forwarded unchanged as
+/// the value pointer and its size argument. This is a thin wrapper around
+/// `cublasLtMatrixLayoutSetAttribute` whose returned status is converted with `.result()`. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatrixlayoutsetattribute)
+///
+/// # Safety
+/// The raw handle and raw pointer are handed straight to the cuBLASLt C API without any checks.
+/// Presumably `matrix_layout` must be a live descriptor and `buf` must be readable for
+/// `buf_size` bytes holding a value of the type `attr` expects — confirm against the nvidia docs.
+pub unsafe fn set_matrix_layout_attribute(
+    matrix_layout: sys::cublasLtMatrixLayout_t,
+    attr: sys::cublasLtMatrixLayoutAttribute_t,
+    buf: *const c_void,
+    buf_size: usize,
+) -> Result<(), CublasError> {
+    // Keep the raw status in a local so the conversion step reads on its own line.
+    let status = sys::cublasLtMatrixLayoutSetAttribute(matrix_layout, attr, buf, buf_size);
+    status.result()
+}
+
 /// Destroys a matrix layout previously created with [create_matrix_layout(...)]. See
 /// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatrixlayoutdestroy)
 ///
@@ -90,7 +102,7 @@
 /// Sets the value of the specified attribute belonging to a previously created matrix multiply
 /// descriptor. See
 /// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmuldescsetattribute)
 pub unsafe fn set_matmul_desc_attribute(
    matmul_desc: sys::cublasLtMatmulDesc_t,
    attr: sys::cublasLtMatmulDescAttributes_t,
    buf: *const c_void,
@@ -124,7 +136,7 @@
 /// Sets the value of the specified attribute belonging to a previously created matrix multiply
 /// preferences descriptor. See
 /// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmulpreferencesetattribute)
 pub unsafe fn set_matmul_pref_attribute(
    matmul_pref: sys::cublasLtMatmulPreference_t,
    attr: sys::cublasLtMatmulPreferenceAttributes_t,
    buf: *const c_void,
@@ -163,13 +175,13 @@

    unsafe {
        sys::cublasLtMatmulAlgoGetHeuristic(
            handle,
            matmul_desc,
            a_layout,
            b_layout,
            c_layout,
            d_layout,
            matmul_pref,
            1, // only select the fastest algo
            matmul_heuristic.as_mut_ptr(),
            &mut algo_count,

diff --git a/src/cublaslt/safe.rs b/src/cublaslt/safe.rs
@@ -1,8 +1,10 @@
 //! Safe abstractions around [crate::cublaslt::result] for doing matmul.
 
 use super::{result, result::CublasError, sys};
+use crate::cublaslt::result::set_matrix_layout_attribute;
 use crate::driver::sys::{CUdevice_attribute, CUdeviceptr, CUstream};
 use crate::driver::{CudaDevice, CudaSlice, DevicePtr, DevicePtrMut, DriverError};
+use core::ffi::c_int;
 use core::mem;
 use std::sync::Arc;
 
@@ -108,6 +110,11 @@ pub struct MatmulConfig {
     pub ldb: i64,
     pub beta: f32,
     pub ldc: i64,
+    pub stride_a: Option<i64>,
+    pub stride_b: Option<i64>,
+    pub stride_c: Option<i64>,
+    pub stride_bias: Option<i64>,
+    pub batch_size: Option<c_int>,
 }
 
 /// Matrix matrix multiplication with elements of type `T`.
@@ -146,8 +153,58 @@ pub trait Matmul<T>: MatmulShared {
 
         // Creates matrix layouts
         let a_layout = result::create_matrix_layout(Self::matrix_type(), a_rows, a_cols, cfg.lda)?;
+        if let (Some(batch_size), Some(stride_a)) = (cfg.batch_size, cfg.stride_a) {
+            // Set batch size
+            set_matrix_layout_attribute(
+                a_layout,
+                sys::cublasLtMatrixLayoutAttribute_t::CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
+                (&batch_size) as *const _ as *const _,
+                mem::size_of::<c_int>(),
+            )?;
+            // Set batch stride
+            set_matrix_layout_attribute(
+                a_layout,
+                sys::cublasLtMatrixLayoutAttribute_t::CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
+                (&stride_a) as *const _ as *const _,
+                mem::size_of::<i64>(),
+            )?;
+        }
+
         let b_layout = result::create_matrix_layout(Self::matrix_type(), b_rows, b_cols, cfg.ldb)?;
+        if let (Some(batch_size), Some(stride_b)) = (cfg.batch_size, cfg.stride_b) {
+            // Set batch size
+            set_matrix_layout_attribute(
+                b_layout,
+                sys::cublasLtMatrixLayoutAttribute_t::CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
+                (&batch_size) as *const _ as *const _,
+                mem::size_of::<c_int>(),
+            )?;
+            // Set batch stride
+            set_matrix_layout_attribute(
+                b_layout,
+                sys::cublasLtMatrixLayoutAttribute_t::CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
+                (&stride_b) as *const _ as *const _,
+                mem::size_of::<i64>(),
+            )?;
+        }
+
         let c_layout = result::create_matrix_layout(Self::matrix_type(), cfg.m, cfg.n, cfg.ldc)?;
+        if let (Some(batch_size), Some(stride_c)) = (cfg.batch_size, cfg.stride_c) {
+            // Set batch size
+            set_matrix_layout_attribute(
+                c_layout,
+                sys::cublasLtMatrixLayoutAttribute_t::CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
+                (&batch_size) as *const _ as *const _,
+                mem::size_of::<c_int>(),
+            )?;
+            // Set batch stride
+            set_matrix_layout_attribute(
+                c_layout,
+                sys::cublasLtMatrixLayoutAttribute_t::CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
+                (&stride_c) as *const _ as *const _,
+                mem::size_of::<i64>(),
+            )?;
+        }
 
         // Matmul description
         let matmul_desc =
@@ -189,6 +246,16 @@ pub trait Matmul<T>: MatmulShared {
                 bias.device_ptr() as *const CUdeviceptr as *const _,
                 mem::size_of::<CUdeviceptr>(),
             )?;
+
+            if let Some(stride_bias) = cfg.stride_bias {
+                // Set bias batch stride
+                result::set_matmul_desc_attribute(
+                    matmul_desc,
+                    sys::cublasLtMatmulDescAttributes_t::CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE,
+                    (&stride_bias) as *const _ as *const _,
+                    mem::size_of::<i64>(),
+                )?;
+            }
             epilogue
         } else if let Some(act) = act {
             // Only Act
@@ -388,6 +455,11 @@ mod tests {
                     ldb: K as i64,
                     beta: 0.0,
                     ldc: N as i64,
+                    stride_a: None,
+                    stride_b: None,
+                    stride_c: None,
+                    stride_bias: None,
+                    batch_size: None,
                 },
                 &b_dev,
                 &a_dev,
@@ -503,6 +575,11 @@ mod tests {
                     ldb: K as i64,
                     beta: 0.0,
                     ldc: N as i64,
+                    stride_a: None,
+                    stride_b: None,
+                    stride_c: None,
+                    stride_bias: None,
+                    batch_size: None,
                 },
                 &b_dev,
                 &a_dev,
@@ -552,6 +629,11 @@ mod tests {
                     ldb: K as i64,
                     beta: 0.0,
                     ldc: N as i64,
+                    stride_a: None,
+                    stride_b: None,
+                    stride_c: None,
+                    stride_bias: None,
+                    batch_size: None,
                 },
                 &b_dev,
                 &a_dev,