diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 844da3e315..37bc73ee24 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -220,7 +220,7 @@ def __init__(
                 )
         self.exllama_version = self.exllama_config["version"]
 
-    def select_quant_linear(self, device_map: Union[str, dict]):
+    def select_quant_linear(self, device_map: Union[str, dict], pack=False):
         if is_gptqmodel_available():
             self.quant_linear = hf_select_quant_linear(
                 bits=self.bits,
@@ -231,6 +231,7 @@ def select_quant_linear(self, device_map: Union[str, dict]):
                 meta=self.meta,
                 device_map=device_map,
                 backend=self.backend,
+                pack=pack,
             )
         else:
             self.quant_linear = hf_select_quant_linear(
@@ -301,7 +302,7 @@ def convert_model(self, model: nn.Module, **kwargs):
                     )
                     del layers_to_be_replaced[name]
 
-        self.select_quant_linear(device_map=kwargs.get("device_map", None))
+        self.select_quant_linear(device_map=kwargs.get("device_map", None), pack=False)
 
         self._replace_by_quant_layers(model, layers_to_be_replaced)
 
@@ -761,7 +762,7 @@ def pack_model(
         layers = get_layers(model)
         layers = {n: layers[n] for n in quantizers}
 
-        self.select_quant_linear(device_map=model.hf_device_map)
+        self.select_quant_linear(device_map=model.hf_device_map, pack=True)
 
         self._replace_by_quant_layers(model, quantizers)
         qlayers = get_layers(model, [self.quant_linear])
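
For context, a minimal sketch of the two call paths this patch separates, assuming is_gptqmodel_available() returns True so the gptqmodel branch of select_quant_linear runs. Only the pack= distinction comes from the patch itself; the quantizer configuration below is a hypothetical example.

    # Hypothetical usage sketch (not part of the patch). The patch forwards
    # pack= to gptqmodel's hf_select_quant_linear: pack_model passes
    # pack=True so the selected QuantLinear class can actually pack weights,
    # while convert_model passes pack=False, leaving gptqmodel free to pick
    # an inference-oriented kernel instead.
    from optimum.gptq import GPTQQuantizer

    quantizer = GPTQQuantizer(bits=4, group_size=128, desc_act=False, sym=True)

    # convert_model path: an inference-only kernel is acceptable.
    quantizer.select_quant_linear(device_map={"": 0}, pack=False)

    # pack_model path: a pack-capable QuantLinear is required.
    quantizer.select_quant_linear(device_map={"": 0}, pack=True)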