diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 92a2ce436d..3d9671caf1 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -261,10 +261,10 @@ def _from_transformers(
             task = task + "-with-past"
 
         # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None or not quantization_config:
-            ov_config = None
+        if load_in_8bit is None and not quantization_config:
+            ov_export_config = None
         else:
-            ov_config = OVConfig(dtype="fp32")
+            ov_export_config = OVConfig(dtype="fp32")
 
         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
 
@@ -279,7 +279,7 @@ def _from_transformers(
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            ov_config=ov_config,
+            ov_config=ov_export_config,
             stateful=stateful,
         )
 
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index a33e0339f3..f56daf9828 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -490,6 +490,27 @@ def test_ovmodel_load_large_model_with_uncompressed_weights(self):
             }
             save_model_patch.aasert_called_with(saving_params)
 
+    def test_ovmodel_load_large_model_with_additional_quantization_config(self):
+        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
+            model_mixin_patch.num_parameters.return_value = 2e9
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    _ = OVModelForCausalLM.from_pretrained(
+                        MODEL_NAMES["llama"],
+                        export=True,
+                        compile=False,
+                        use_cache=False,
+                        quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8),
+                    )
+                    # quantization will be performed later, using load_model
+                    saving_params = {
+                        "model": unittest.mock.ANY,
+                        "path": unittest.mock.ANY,
+                        "compression_option": "fp32",
+                        "compression_ratio": None,
+                    }
+                    save_model_patch.assert_called_with(**saving_params)
+
 
 class OVQuantizerQATest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = (("hf-internal-testing/tiny-random-BertForQuestionAnswering",),)
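
Reviewer note on the predicate change in _from_transformers: with the old "or", a call that supplied only a quantization_config (leaving load_in_8bit as None) still took the "nothing specified" branch, so the export fell back to the size-based default compression instead of keeping fp32 weights for the later, user-requested quantization. A minimal standalone sketch of the two predicates, outside the patch; the helper names are invented for illustration:

    def defers_to_default_compression_old(load_in_8bit, quantization_config):
        # Old predicate: true whenever load_in_8bit is unset,
        # even if a quantization_config WAS provided.
        return load_in_8bit is None or not quantization_config

    def defers_to_default_compression_new(load_in_8bit, quantization_config):
        # Fixed predicate: defer to the default only when BOTH options are unspecified.
        return load_in_8bit is None and not quantization_config

    # The case exercised by the new test: a 4-bit config is given, load_in_8bit is not.
    print(defers_to_default_compression_old(None, {"bits": 4}))  # True  -> wrong branch
    print(defers_to_default_compression_new(None, {"bits": 4}))  # False -> export fp32, quantize later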