From 738d7f2c5a48367822f2bf9d538160d19f02341e Mon Sep 17 00:00:00 2001 From: Nikhil Gupta Date: Wed, 14 Aug 2024 15:45:44 +0000 Subject: [PATCH] [Feat]: Add support for kleidiai quantization schemes Description: 1. Allow Int4WeightOnlyQuantizer to work with channelwise and groupwise symmetric quantization schemes 2. KleidiAI supports channelwise and 32 groupwise quantized matmul kernels Signed-off-by: Nikhil Gupta --- torchao/quantization/GPTQ.py | 319 +++++++++++++++++++++++++++------- torchao/quantization/utils.py | 95 ++++++++++ 2 files changed, 354 insertions(+), 60 deletions(-) diff --git a/torchao/quantization/GPTQ.py b/torchao/quantization/GPTQ.py index cb7c8d0481..7099dae136 100644 --- a/torchao/quantization/GPTQ.py +++ b/torchao/quantization/GPTQ.py @@ -36,6 +36,7 @@ groupwise_affine_quantize_tensor_from_qparams, pack_tinygemm_scales_and_zeros, per_token_dynamic_quant, + prepare_int4_weight_and_scales_and_zeros, ) aten = torch.ops.aten @@ -533,6 +534,7 @@ def linear_forward_int4( weight_int4pack: torch.Tensor, scales_and_zeros: torch.Tensor, out_features: int, + in_features: int, groupsize: int, precision: torch.dtype = torch.bfloat16, scales_precision: torch.dtype = torch.bfloat16, @@ -558,6 +560,136 @@ def linear_forward_int4( return c +def linear_forward_int4_dynamic_quantization_4bit( + x: torch.Tensor, + weight_int4pack: torch.Tensor, + scales_and_zeros: torch.Tensor, + out_features: int, + in_features: int, + groupsize: int, + precision: torch.dtype = torch.bfloat16, + scales_precision: torch.dtype = torch.bfloat16, +): + origin_x_size = x.size() + x = x.reshape(-1, origin_x_size[-1]) + c = torch.ops.aten._dyn_quant_matmul_4bit( + x.to(precision), weight_int4pack, groupsize, in_features, out_features) + new_shape = origin_x_size[:-1] + (out_features,) + c = c.reshape(new_shape) + return c + +def kai_roundup(a: int, b: int) -> int: + return ((a + b - 1) // b) * b + + +def get_kai_packed_weight_size(n_bits, N, K, groupsize): + if n_bits == 4: + if groupsize == K: # channelwise + # dotprod params only [1x8x32_neon_dotprod] + kai_nr = 8 + kai_kr = 16 + kai_sr = 2 + kai_num_bytes_sum_rhs = 4 # sizeof(int32_t) + kai_num_bytes_multiplier_rhs = 4 # sizeof(float) + kai_num_bytes_bias = 4 # sizeof(float) + + def kai_k_roundedup(k, kr, sr): + # Since we pack a float and int32 value at the end of the row, + # we must make sure that k is a multiple of 4 for alignment + kr_sr_roundedup4 = kai_roundup(kr * sr, 4) + return kai_roundup(k, kr_sr_roundedup4) + + def kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0( + k, nr, kr, sr + ): + k_internal = kai_k_roundedup(k, kr, sr) + + assert (k_internal % 2) == 0, "k_internal must be even" + + return nr * ( + (k_internal // 2) + + kai_num_bytes_multiplier_rhs + + kai_num_bytes_sum_rhs + + kai_num_bytes_bias + ) + + def kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0( + n, k, nr, kr, sr + ): + num_rows = kai_roundup(n, nr) // nr + + return ( + num_rows + * kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0( + k, nr, kr, sr + ) + ) + + return kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0( + N, K, kai_nr, kai_kr, kai_sr + ) + elif groupsize % 32 == 0 and K % groupsize == 0: # groupwise + kai_nr = 8 + kai_kr = 16 + kai_sr = 2 + kai_num_bytes_sum_rhs = 4 + kai_num_bytes_bias = 4 + kai_nr_multiple_of = 4 + kai_bl_multiple_of = 32 + + def kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( + n, k, nr, kr, sr, bl + ): + assert (bl % kr) == 0 + assert (nr % kai_nr_multiple_of) == 0 + assert (bl % kai_bl_multiple_of) 
== 0 + + num_rows = kai_roundup(n, nr) // nr + + return ( + num_rows + * kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( + k, nr, kr, sr, bl + ) + ) + + def kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( + k, nr, kr, sr, bl + ): + assert (bl % kr) == 0 + assert (nr % kai_nr_multiple_of) == 0 + assert (bl % kai_bl_multiple_of) == 0 + + # kr and sr are unused in the calculation + num_bytes_multiplier_rhs = kai_get_bf16_datatype_size_in_bytes() + num_blocks_per_row = kai_num_blocks_per_row(k, bl) + num_bytes_per_block = kai_num_bytes_per_block( + bl, num_bytes_multiplier_rhs + ) + + return nr * ( + (num_bytes_per_block * num_blocks_per_row) + + kai_num_bytes_sum_rhs + + kai_num_bytes_bias + ) + + # This funtion retuns size of these datatypes stored as enum. We modify it to just return bf16 datatype + # https://gitlab.arm.com/kleidi/kleidiai/-/blob/main/kai/kai_common.h?ref_type=heads#L55 + def kai_get_bf16_datatype_size_in_bytes(): + return 2 # 2 bytes + + def kai_num_blocks_per_row(k, bl): + assert (bl % kai_bl_multiple_of) == 0 + return kai_roundup(k, bl) // bl + + def kai_num_bytes_per_block(bl, num_bytes_multiplier_rhs): + assert (bl % kai_bl_multiple_of) == 0 + return (bl // 2) + num_bytes_multiplier_rhs + + return kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( + N, K, kai_nr, kai_kr, kai_sr, groupsize + ) + class WeightOnlyInt4Linear(torch.nn.Module): __constants__ = ["in_features", "out_features"] in_features: int @@ -576,21 +708,29 @@ def __init__( inner_k_tiles: int = 8, precision: torch.dtype = torch.bfloat16, scales_precision: torch.dtype = torch.bfloat16, + scheme = None, ) -> None: super().__init__() - self.padding = not _check_linear_int4_k(in_features, groupsize, inner_k_tiles) + self.padding = scheme is None and not _check_linear_int4_k(in_features, groupsize, inner_k_tiles) if self.padding: self.origin_in_features = in_features in_features = find_multiple(in_features, 1024) self.in_features = in_features self.out_features = out_features - assert not bias, "require bias=False" self.device = device self.groupsize = groupsize self.inner_k_tiles = inner_k_tiles self.precision = precision self.scales_precision = scales_precision + self.scheme = scheme + self.bias = bias + + self.custom_forward = linear_forward_int4 + if torch.backends.kleidiai.is_available() and (self.scheme == "symmetric_channelwise" or self.scheme == "symmetric_groupwise"): + self.custom_forward = linear_forward_int4_dynamic_quantization_4bit + else: + self.scheme = None if dtype is not None: raise ValueError("Please specify 'precision' instead of 'dtype'") @@ -600,17 +740,36 @@ def __init__( in_features % (inner_k_tiles * 16) == 0 ), "require in_features % (innerKTiles * 16) == 0" if is_device(device.type, "cpu"): - self.register_buffer( - "weight", - torch.zeros( - ( - out_features, - in_features // 2, + if scheme is None: + assert not bias, "require bias=False" + self.register_buffer( + "weight", + torch.zeros( + ( + out_features, + in_features // 2, + ), + dtype=torch.uint8, + device=device, ), - dtype=torch.uint8, - device=device, - ), - ) + ) + else: + if torch.backends.kleidiai.is_available() and ( + (groupsize == in_features and scales_precision == torch.float) + or ( + groupsize < in_features + and groupsize % 32 == 0 + and in_features % groupsize == 0 + and scales_precision == torch.bfloat16 + ) + ): + packed_weight_size = get_kai_packed_weight_size(4,out_features, in_features, groupsize) + self.register_buffer( + "weight", + torch.empty((packed_weight_size), 
dtype=torch.uint8) + ) + else: + raise ValueError("KleidiAI backend can not be initiated") else: self.register_buffer( "weight", @@ -625,30 +784,49 @@ def __init__( device=device, ), ) - self.dtype = dtype - self.register_buffer( - "scales_and_zeros", - torch.zeros( - (in_features // groupsize, out_features, 2), - dtype=self.scales_precision, - device=device, - ), - ) + if scheme is None: + self.register_buffer( + "scales_and_zeros", + torch.zeros( + (in_features // groupsize, out_features, 2), + dtype=self.scales_precision, + device=device, + ), + ) + elif scheme == "symmetric_channelwise": + self.register_buffer( + "scales_and_zeros", + torch.zeros( + (out_features), + dtype=self.scales_precision, + device=device, + ), + ) + elif scheme == "symmetric_groupwise": + self.register_buffer( + "scales_and_zeros", + torch.zeros( + (out_features, in_features//groupsize), + dtype=self.scales_precision, + device=device, + ), + ) + def forward(self, input: torch.Tensor) -> torch.Tensor: if self.padding: input = F.pad(input, pad=(0, self.in_features - self.origin_in_features)) - return linear_forward_int4( + return self.custom_forward( input, self.weight, self.scales_and_zeros, self.out_features, + self.in_features, self.groupsize, self.precision, self.scales_precision, ) - def _replace_linear_int4( module: torch.nn.Module, groupsize: int, @@ -659,27 +837,31 @@ def _replace_linear_int4( scales_precision: torch.dtype = torch.bfloat16, linear_class: Type[torch.nn.Module] = WeightOnlyInt4Linear, copy_weights: bool = False, + scheme=None, ): for name, child in module.named_children(): - # TODO: support linear bias - if ( - isinstance(child, nn.Linear) - and child.bias is None - and (skip_layer_func is None or not skip_layer_func(child.weight)) - ): - if ( - _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) - or padding_allowed - ): + if isinstance(child, nn.Linear) and (skip_layer_func is None or not skip_layer_func(child.weight)): + if scheme is not None or _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) or padding_allowed: + bias = False + if child.bias is not None: + bias = child.bias + if scheme == "symmetric_channelwise": + groupsize = child.in_features + scales_precision = torch.float32 + elif scheme == "symmetric_groupwise": + # Scales are actually f16 but they are packed along with weights + # To maintain api compatibility we populate scaled with no element in this scheme + scales_precision = torch.bfloat16 new_linear = linear_class( child.in_features, child.out_features, - bias=False, + bias=bias, device=child.weight.device, groupsize=groupsize, inner_k_tiles=inner_k_tiles, precision=precision, scales_precision=scales_precision, + scheme=scheme, ) # TODO: merge with 8da4w? 
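+                # Scheme handling above: bias is now forwarded to the replacement
+                # module instead of being rejected. For "symmetric_channelwise" the
+                # effective groupsize becomes the full row (child.in_features) and
+                # scales stay in float32; for "symmetric_groupwise" the caller's
+                # groupsize is kept and bf16 scales are packed together with the
+                # weights, so the standalone scales_and_zeros buffer is only kept
+                # for API compatibility.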
# In distributed training, the model may be instantiated @@ -699,6 +881,7 @@ def _replace_linear_int4( scales_precision, linear_class, copy_weights, + scheme=scheme, ) @@ -723,10 +906,11 @@ def __init__( inner_k_tiles: Optional[int] = 8, device: torch.device = torch.device("cuda"), precision: torch.dtype = torch.bfloat16, + scheme = None ) -> None: super().__init__() assert inner_k_tiles in [2, 4, 8] - assert groupsize in [32, 64, 128, 256] + assert groupsize in [32, 64, 128, 256] # 0 group size is allowed for channelwise scheme where groupsize = row size self.inner_k_tiles = inner_k_tiles self.groupsize: int = groupsize @@ -734,6 +918,7 @@ def __init__( self.device: torch.device = device # precision and dtype are being used interchangeably here self.precision: torch.dtype = precision + self.scheme = scheme @torch.no_grad() def _create_quantized_state_dict( @@ -742,18 +927,19 @@ def _create_quantized_state_dict( cur_state_dict = model.state_dict() for fqn, mod in model.named_modules(): if isinstance(mod, torch.nn.Linear): - assert not mod.bias + if self.scheme is None: + assert not mod.bias out_features = mod.out_features in_features = mod.in_features # assert out_features % 8 == 0, "require out_features % 8 == 0" - logging.info(f"linear: {fqn}, in={in_features}, out={out_features}") - assert ( - in_features % self.groupsize == 0 - ), f"require in_features:{in_features} % self.groupsize:{self.groupsize} == 0" + if self.scheme is None: + assert ( + in_features % self.groupsize == 0 + ), f"require in_features:{in_features} % self.groupsize:{self.groupsize} == 0" weight = mod.weight.data - if not _check_linear_int4_k( + if self.scheme is None and not _check_linear_int4_k( in_features, self.groupsize, self.inner_k_tiles ): if self.padding_allowed: @@ -772,30 +958,42 @@ def _create_quantized_state_dict( + "and that groupsize and inner_k_tiles*16 evenly divide into it" ) continue - (w_int4x8, scales_and_zeros) = groupwise_affine_quantize_tensor( - weight, - 4, # n_bit - self.groupsize, - self.precision, # dtype for scales_and_zeros - ) - # TODO: just get the device from mod.weight.device? - if ( - is_device(w_int4x8.device.type, "cpu") - and TORCH_VERSION_AT_LEAST_2_6 - ): - weight_int4pack = ( - torch.ops.aten._convert_weight_to_int4pack_for_cpu( - w_int4x8.to(self.device), self.inner_k_tiles - ) + if (self.scheme == "symmetric_channelwise" or self.scheme == "symmetric_groupwise") and is_device(weight.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6: + if self.scheme == "symmetric_channelwise": + self.groupsize = mod.in_features + ( + w_int4x8, + scales_and_zeros + ) = prepare_int4_weight_and_scales_and_zeros( + weight.to(self.precision), self.groupsize, self.inner_k_tiles, self.scheme, precision=self.precision ) + weight_int4pack = torch.ops.aten._dyn_quant_pack_4bit_weight(w_int4x8.to(self.device), scales_and_zeros, mod.bias, self.groupsize, mod.in_features, mod.out_features) else: - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - w_int4x8.to(self.device), self.inner_k_tiles + (w_int4x8, scales_and_zeros) = groupwise_affine_quantize_tensor( + weight, + 4, # n_bit + self.groupsize, + self.precision, # dtype for scales_and_zeros ) + # TODO: just get the device from mod.weight.device? 
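+                    # Fallback path (scheme is None): the original tinygemm flow is
+                    # unchanged. On CPU with torch >= 2.6 the int4 weight is packed
+                    # via _convert_weight_to_int4pack_for_cpu, otherwise via
+                    # _convert_weight_to_int4pack.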
+ if ( + is_device(w_int4x8.device.type, "cpu") + and TORCH_VERSION_AT_LEAST_2_6 + ): + weight_int4pack = ( + torch.ops.aten._convert_weight_to_int4pack_for_cpu( + w_int4x8.to(self.device), self.inner_k_tiles + ) + ) + else: + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + w_int4x8.to(self.device), self.inner_k_tiles + ) cur_state_dict[f"{fqn}.weight"] = weight_int4pack.to(self.device) cur_state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to( - self.device - ) + self.device + ) + logging.info(f"quantizing linear: {fqn}, in={in_features}, out={out_features}, to scheme={self.scheme}, using blocksize={self.groupsize}") return cur_state_dict def _convert_for_runtime(self, model: torch.nn.Module) -> torch.nn.Module: @@ -807,6 +1005,7 @@ def _convert_for_runtime(self, model: torch.nn.Module) -> torch.nn.Module: skip_layer_func=None, precision=self.precision, scales_precision=self.precision, + scheme=self.scheme, ) return model diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py index 74c136ad00..401ec1d93f 100644 --- a/torchao/quantization/utils.py +++ b/torchao/quantization/utils.py @@ -454,6 +454,7 @@ def groupwise_affine_dequantize_tensor_from_qparams( ) + def groupwise_affine_quantize_tensor(w, n_bit=4, groupsize=128, dtype=torch.bfloat16): scales, zeros = get_groupwise_affine_qparams(w, n_bit, groupsize, dtype) w_int4x8 = groupwise_affine_quantize_tensor_from_qparams( @@ -463,6 +464,100 @@ def groupwise_affine_quantize_tensor(w, n_bit=4, groupsize=128, dtype=torch.bflo return w_int4x8, scales_and_zeros +def get_group_qparams(w, n_bit=4, groupsize=128, scheme="symmetric_channelwise", precision=torch.bfloat16): + if groupsize > w.shape[-1] or scheme == "symmetric_channelwise": + groupsize = w.shape[-1] + assert groupsize > 1 + assert w.shape[-1] % groupsize == 0 + assert w.dim() == 2 + + to_quant = w.reshape(-1, groupsize) + assert torch.isnan(to_quant).sum() == 0 + + # improved symmetric 4 bit quantization that uses bin correspondingto -8 from [-8,7] ( -2^(b-1) , 2^(b-1)-1 ) range + if scheme == "symmetric_groupwise": + to_quant_abs = to_quant.abs() + max_abs_indices = to_quant_abs.argmax(dim=1, keepdim=True) + max_val = torch.gather(to_quant, 1, max_abs_indices) + scales = max_val / -8 + zeros = torch.zeros_like(scales) + + elif scheme == "symmetric_channelwise": + to_quant_abs = to_quant.abs() + max_abs_indices = to_quant_abs.argmax(dim=1, keepdim=True) + max_val = torch.gather(to_quant, 1, max_abs_indices) + scales = max_val / -8 + zeros = torch.zeros_like(scales) + return scales.to(precision).reshape(w.shape[0], -1), zeros.to( + precision + ).reshape(w.shape[0], -1) + + +def group_quantize_tensor_from_qparams(w, scales, zeros, n_bit=4, groupsize=128, scheme="symmetric_channelwise", precision=torch.bfloat16): + assert groupsize > 1 + if groupsize > w.shape[-1] and scales.shape[-1] == 1: + groupsize = w.shape[-1] + + assert w.shape[-1] % groupsize == 0 + assert w.dim() == 2 + + to_quant = w.reshape(-1, groupsize) + assert torch.isnan(to_quant).sum() == 0 + + scales = scales.reshape(-1, 1) + zeros = zeros.reshape(-1, 1) + if scheme == "symmetric_groupwise": + max_int = 2**n_bit - 1 + w_int8 = ( + to_quant.div(scales) + .add(8.5) + .to(torch.int8) + .clamp(max=max_int) + ) + elif scheme == "symmetric_channelwise": + max_int = 2**n_bit - 1 + w_int8 = ( + to_quant.div(scales) + .add(8.5) + .to(torch.int8) + .clamp(max=max_int) + ) + # We pack every odd index value in upper 4 bits and every even index value in lower 4 bits + w_uint8 = (w_int8[::, 
1::2] << 4 | w_int8[::, ::2]).to(torch.uint8) + return w_uint8 + + +def pack_scales_and_zeros(scales, zeros, scheme="symmetric_channelwise"): + assert scales.shape == zeros.shape + scales_zeros = scales.squeeze().contiguous() + return scales_zeros + + +def group_quantize_tensor(w, n_bit=4, groupsize=128, scheme="symmetric_channelwise", precision=torch.bfloat16): + scales, zeros = get_group_qparams( + w, n_bit, groupsize, scheme=scheme, precision=precision) + w_uint8 = group_quantize_tensor_from_qparams( + w, scales, zeros, n_bit, groupsize, scheme=scheme) + scales_and_zeros = pack_scales_and_zeros(scales, zeros, scheme=scheme) + return w_uint8, scales_and_zeros + + +def prepare_int4_weight_and_scales_and_zeros(weights, groupsize, inner_k_tiles, scheme="symmetric_channelwise", precision=torch.bfloat16): + assert weights.dim() == 2 + assert groupsize > 1 + if groupsize > weights.shape[-1] or scheme == "symmetric_channelwise": + groupsize = weights.shape[-1] + assert weights.shape[-1] % groupsize == 0 + weight_int4pack, scales_and_zeros = group_quantize_tensor( + weights, n_bit=4, groupsize=groupsize, scheme=scheme, precision=precision + ) + if scheme == "symmetric_channelwise": + scales_and_zeros = scales_and_zeros.to(dtype=torch.float32) + elif scheme == "symmetric_groupwise": + scales_and_zeros = scales_and_zeros.to(dtype=torch.bfloat16) + return weight_int4pack, scales_and_zeros + + def groupwise_affine_dequantize_tensor( w_int4x8, scales_and_zeros,
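A minimal usage sketch of the new scheme argument, for reviewers. It assumes the existing Int4WeightOnlyQuantizer.quantize(model) entry point and torch >= 2.6 on an Arm CPU where torch.backends.kleidiai.is_available() returns True; the toy model, layer sizes, and variable names below are illustrative only and not part of this patch.

import torch
import torch.nn as nn

from torchao.quantization.GPTQ import Int4WeightOnlyQuantizer


# Toy model: in_features is a multiple of 32 so the groupwise scheme applies.
class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(256, 512, bias=False)
        self.fc2 = nn.Linear(512, 64, bias=False)

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))


model = TinyModel().to(dtype=torch.bfloat16)

if torch.backends.kleidiai.is_available():
    # Groupwise: 4-bit weights with one bf16 scale per 32-element group, packed
    # via torch.ops.aten._dyn_quant_pack_4bit_weight at quantization time and
    # consumed by torch.ops.aten._dyn_quant_matmul_4bit in forward.
    quantizer = Int4WeightOnlyQuantizer(
        groupsize=32,
        device=torch.device("cpu"),
        precision=torch.bfloat16,
        scheme="symmetric_groupwise",
    )
    model = quantizer.quantize(model)

    # Channelwise variant: the quantizer widens groupsize to each layer's
    # in_features internally, so the groupsize given here only needs to pass
    # the existing [32, 64, 128, 256] assert.
    # quantizer = Int4WeightOnlyQuantizer(
    #     groupsize=32, device=torch.device("cpu"),
    #     precision=torch.bfloat16, scheme="symmetric_channelwise")

    out = model(torch.randn(2, 256, dtype=torch.bfloat16))
    print(out.shape)  # torch.Size([2, 64])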