diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 92a2ce436d..3d9671caf1 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -261,10 +261,10 @@ def _from_transformers(
             task = task + "-with-past"
 
         # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None or not quantization_config:
-            ov_config = None
+        if load_in_8bit is None and not quantization_config:
+            ov_export_config = None
         else:
-            ov_config = OVConfig(dtype="fp32")
+            ov_export_config = OVConfig(dtype="fp32")
 
         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
 
@@ -279,7 +279,7 @@ def _from_transformers(
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            ov_config=ov_config,
+            ov_config=ov_export_config,
             stateful=stateful,
         )
 
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index a33e0339f3..57c45df6ec 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -459,36 +459,64 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type):
             self.assertEqual(0, num_int8)
 
     def test_ovmodel_load_large_model_with_default_compressed_weights(self):
-        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
-            model_mixin_patch.num_parameters.return_value = 2e9
+        with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters:
+            mock_tensor = unittest.mock.Mock()
+            mock_tensor.numel = lambda: 2000000000
+            mock_tensor.requires_grad = True
+            model_parameters.return_value = [mock_tensor]
             with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
                 with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
                     _ = OVModelForCausalLM.from_pretrained(
                         MODEL_NAMES["llama"], export=True, compile=False, use_cache=False
                     )
-                    saving_params = {
-                        "model": unittest.mock.ANY,
-                        "path": unittest.mock.ANY,
-                        "compression_option": "int8",
-                        "compression_ratio": None,
-                    }
-                    save_model_patch.aasert_called_with(saving_params)
+                    save_model_patch.assert_called_with(
+                        unittest.mock.ANY, unittest.mock.ANY, ov_config=OVConfig(quantization_config={"bits": 8})
+                    )
 
     def test_ovmodel_load_large_model_with_uncompressed_weights(self):
-        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
-            model_mixin_patch.num_parameters.return_value = 2e9
+        with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters:
+            mock_tensor = unittest.mock.Mock()
+            mock_tensor.numel = lambda: 2000000000
+            mock_tensor.requires_grad = True
+            model_parameters.return_value = [mock_tensor]
             with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
                 with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
                     _ = OVModelForCausalLM.from_pretrained(
                         MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False
                     )
-                    saving_params = {
-                        "model": unittest.mock.ANY,
-                        "path": unittest.mock.ANY,
-                        "compression_option": "fp32",
-                        "compression_ratio": None,
-                    }
-                    save_model_patch.aasert_called_with(saving_params)
+                    save_model_patch.assert_called_with(
+                        unittest.mock.ANY, unittest.mock.ANY, ov_config=OVConfig(dtype="fp32")
+                    )
+
+    def test_ovmodel_load_large_model_with_additional_quantization_config(self):
+        with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters:
+            mock_tensor = unittest.mock.Mock()
+            mock_tensor.numel = lambda: 2000000000
+            mock_tensor.requires_grad = True
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch:
+                        _ = OVModelForCausalLM.from_pretrained(
+                            MODEL_NAMES["llama"],
+                            export=True,
+                            compile=False,
+                            use_cache=False,
+                            quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8),
+                        )
+                        # quantization will be performed later, using load_model
+                        save_model_patch.assert_called_with(
+                            unittest.mock.ANY, unittest.mock.ANY, ov_config=OVConfig(dtype="fp32")
+                        )
+                        compression_params = {
+                            "mode": nncf.CompressWeightsMode.INT4_SYM,
+                            "ratio": 0.8,
+                            "group_size": -1,
+                            "all_layers": None,
+                            "sensitivity_metric": None,
+                            "dataset": None,
+                            "ignored_scope": None,
+                        }
+                        compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params)
 
 
 class OVQuantizerQATest(unittest.TestCase):