int8 matmul backend (experimental)
mobicham committed Mar 19, 2024
1 parent 5ada837 commit df43514
Showing 1 changed file with 24 additions and 0 deletions.
hqq/core/quantize.py: 24 additions & 0 deletions
@@ -250,6 +250,9 @@ class HQQBackend(Enum):
    PYTORCH_FORWARD_COMPILE = "forward_pytorch_compile"
    ATEN_FORWARD = "forward_aten"

    # Experimental
    ATEN_FORWARD_INT8 = "forward_aten_int8"


# No cache: less memory, slower
class HQQMatmulNoCacheDeq(torch.autograd.Function):
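
To route an HQQLinear layer through this experimental path, one would presumably select the new backend the same way as the existing ones. A minimal sketch, assuming the usual HQQLinear.set_backend helper (not part of this commit):

from hqq.core.quantize import HQQLinear, HQQBackend

# Switch HQQLinear layers to the experimental int8 matmul backend
# (assumption: same set_backend mechanism used for the other HQQBackend values)
HQQLinear.set_backend(HQQBackend.ATEN_FORWARD_INT8)
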
@@ -789,6 +792,27 @@ def forward_aten(self, x: Tensor) -> Tensor:
    def forward_aten_backprop(self, x: Tensor) -> Tensor:
        return HQQMatmulNoCacheDeq.apply(x, self.dequantize_aten, self.bias)

    # TODO: as fused kernel in CUDA
    def _get_int8_matrix(self, M):
        # Symmetric per-tensor quantization: one absmax scale, int8 values in [-127, 127]
        scale = torch.abs(M).amax() / 127.0
        return torch.round(M / scale).to(torch.int8), scale.float()

    # TODO: in ATEN
    @torch.compile()
    def _matmul_int8(self, A, B):
        # Quantize both operands to int8, run the int8 GEMM (int32 accumulation),
        # then rescale the result back to the original dtype
        dtype = A.dtype
        A, scale_A = self._get_int8_matrix(A)
        B, scale_B = self._get_int8_matrix(B)
        return (torch._int_mm(A, B) * (scale_A * scale_B)).to(dtype)

    def forward_aten_int8(self, x: Tensor) -> Tensor:
        # Dequantize the HQQ weight, then run the matmul through the int8 path;
        # x[0] drops the leading dim for the 2D GEMM and [None, ...] restores it
        W_est = self.dequantize_aten()
        out = self._matmul_int8(x[0], W_est.t())[None, ...]
        if self.bias is not None:
            out += self.bias

        return out


def hqq_base_quant_config(
    nbits: int = 4,
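
For reference, a standalone sketch of what the new int8 path computes, mirroring the commit's symmetric absmax quantization and torch._int_mm call outside of HQQLinear. Shapes and names below are illustrative; torch._int_mm is a private PyTorch op with device/shape restrictions, and the sizes here are chosen to satisfy them on CUDA:

import torch

def get_int8_matrix(M):
    # Per-tensor symmetric quantization: one absmax scale, int8 values in [-127, 127]
    scale = torch.abs(M).amax() / 127.0
    return torch.round(M / scale).to(torch.int8), scale.float()

def matmul_int8(A, B):
    dtype = A.dtype
    A_q, scale_A = get_int8_matrix(A)
    B_q, scale_B = get_int8_matrix(B)
    # int8 x int8 GEMM with int32 accumulation, rescaled back to the input dtype
    return (torch._int_mm(A_q, B_q) * (scale_A * scale_B)).to(dtype)

x = torch.randn(128, 4096, dtype=torch.float16, device="cuda")
W = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
out_int8 = matmul_int8(x, W.t())
out_fp16 = x @ W.t()
print((out_int8 - out_fp16).abs().mean())  # rough error introduced by the int8 path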
