Add support for Marlin 2:4 sparsity (#2102)

This change adds support for 2:4 sparsity when using Marlin quantization. The 2:4 kernel is used when: * The quantizer is `marlin`; * the quantizer checkpoint format is `marlin_24`. Fixes #2098.
huggingface · Jun 25, 2024 · f1f98e3 · f1f98e3
1 parent 14980df
commit f1f98e3
Show file tree

Hide file tree

Showing 11 changed files with 1,731 additions and 16 deletions.
diff --git a/server/marlin/marlin_kernels/__init__.pyi b/server/marlin/marlin_kernels/__init__.pyi
@@ -19,6 +19,23 @@ def gptq_marlin_gemm(
     """
     ...
 
+def gptq_marlin_24_gemm(
+    a: torch.Tensor,
+    b_q_weight: torch.Tensor,
+    b_meta: torch.Tensor,
+    b_scales: torch.Tensor,
+    workspace: torch.Tensor,
+    num_bits: int,
+    size_m: int,
+    size_n: int,
+    size_k: int,
+) -> torch.Tensor:
+    """
+    Matrix multiplication using Marlin kernels. This is an extension of
+    `marlin_gemm` that supports 2:4 sparsity.
+    """
+    ...
+
 def gptq_marlin_repack(
     b_q_weight: torch.Tensor,
     perm: torch.Tensor,

diff --git a/server/marlin/marlin_kernels/ext.cpp b/server/marlin/marlin_kernels/ext.cpp
@@ -5,6 +5,7 @@
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("gptq_marlin_gemm", &gptq_marlin_gemm,
         "Marlin gemm with GPTQ compatibility");
+  m.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm, "Marlin sparse 2:4 gemm");
   m.def("gptq_marlin_repack", &gptq_marlin_repack,
         "Repack GPTQ parameters for Marlin");
   m.def("marlin_gemm", &marlin_gemm, "Marlin gemm");

diff --git a/server/marlin/marlin_kernels/ext.hh b/server/marlin/marlin_kernels/ext.hh
@@ -12,6 +12,13 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                                int64_t num_bits, int64_t size_m, int64_t size_n,
                                int64_t size_k, bool is_k_full);
 
+torch::Tensor gptq_marlin_24_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
+                                  torch::Tensor &b_meta,
+                                  torch::Tensor &b_scales,
+                                  torch::Tensor &workspace, int64_t num_bits,
+                                  int64_t size_m, int64_t size_n,
+                                  int64_t size_k);
+
 torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
                                  int64_t size_k, int64_t size_n,
                                  int64_t num_bits);

diff --git a/server/marlin/marlin_kernels/sparse/common/base.h b/server/marlin/marlin_kernels/sparse/common/base.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2024 Roberto Lopez Castro ([email protected]). All
+ * Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace marlin_24 {
+
+constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
+
+// Instances of `Vec` are used to organize groups of >>registers<<, as needed
+// for instance as inputs to tensor core operations. Consequently, all
+// corresponding index accesses must be compile-time constants, which is why we
+// extensively use `#pragma unroll` throughout the kernel code to guarantee
+// this.
+template <typename T, int n>
+struct Vec {
+  T elems[n];
+  __device__ T& operator[](int i) { return elems[i]; }
+};
+
+template <int M_, int N_, int K_>
+struct ShapeBase {
+  static constexpr int M = M_, N = N_, K = K_;
+};
+
+using I4 = Vec<int, 4>;
+
+// Matrix fragments for tensor core instructions; their precise layout is
+// documented here:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
+using FragA = Vec<half2, 4>;
+using FragB = Vec<half2, 2>;
+using FragM = Vec<uint, 1>;
+using FragC = Vec<float, 4>;
+using FragS = Vec<half2, 1>;  // quantization scales
+
+}  // namespace marlin_24
diff --git a/server/marlin/marlin_kernels/sparse/common/mem.h b/server/marlin/marlin_kernels/sparse/common/mem.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2024 Roberto Lopez Castro ([email protected]). All
+ * Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "base.h"
+
+namespace marlin_24 {
+// Predicated asynchronous global->shared copy; used for inputs A where we apply
+// predication to handle batchsizes that are not multiples of 16.
+__device__ inline void cp_async4_pred_zfill(void* smem_ptr,
+                                            const void* glob_ptr,
+                                            bool pred = true,
+                                            const bool zfill = false) {
+  const int BYTES = 16;
+  int src_in_bytes = (zfill ? 0 : BYTES);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES), "r"(src_in_bytes));
+}
+
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
+                                      bool pred = true) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+// Asynchronous global->shared copy
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));
+}
+
+// Async copy fence.
+__device__ inline void cp_async_fence() {
+  asm volatile("cp.async.commit_group;\n" ::);
+}
+
+// Wait until at most `n` async copy stages are still pending.
+template <int n>
+__device__ inline void cp_async_wait() {
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+               : "r"(smem));
+}
+
+__device__ inline void ldsm4_m(FragM& frag_m, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_m);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n"
+               : "=r"(a[0]), "=r"(a[1])
+               : "r"(smem));
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+__device__ inline void ldsm4_t(FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+      : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+      : "r"(smem));
+}
+
+// Wait until barrier reaches `count`, then lock for current threadblock.
+__device__ inline void barrier_acquire(int* lock, int count) {
+  if (threadIdx.x == 0) {
+    int state = -1;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
+                   : "=r"(state)
+                   : "l"(lock));
+    while (state != count);
+  }
+  __syncthreads();
+}
+
+// Release barrier and increment visitation count.
+__device__ inline void barrier_release(int* lock, bool reset = false) {
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    if (reset) {
+      lock[0] = 0;
+      return;
+    }
+    int val = 1;
+    // Make sure that all writes since acquiring this barrier are visible
+    // globally, while releasing the barrier.
+    asm volatile("fence.acq_rel.gpu;\n");
+    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
+                 :
+                 : "l"(lock), "r"(val));
+  }
+}
+}  // namespace marlin_24