hot fix for weights compression
eaidova committed Mar 11, 2024
1 parent 72b0630 commit 01c8117
Showing 2 changed files with 25 additions and 4 deletions.
8 changes: 4 additions & 4 deletions optimum/intel/openvino/modeling_decoder.py
@@ -261,10 +261,10 @@ def _from_transformers(
             task = task + "-with-past"
 
         # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None or not quantization_config:
-            ov_config = None
+        if load_in_8bit is None and not quantization_config:
+            ov_export_config = None
         else:
-            ov_config = OVConfig(dtype="fp32")
+            ov_export_config = OVConfig(dtype="fp32")
 
         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
 
@@ -279,7 +279,7 @@ def _from_transformers(
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            ov_config=ov_config,
+            ov_config=ov_export_config,
             stateful=stateful,
         )
 
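The substance of the fix is the change from "or" to "and" in the condition above (the ov_config to ov_export_config rename looks like a clarity change only). With "or", a user who passed an explicit quantization_config while leaving load_in_8bit unset still fell into the None branch, so the size-based default compression could be applied during export on top of the requested scheme. A minimal standalone sketch of how the two conditions diverge (illustration only, not library code):

# Standalone illustration of the flipped condition; not library code.
load_in_8bit = None                # user did not set load_in_8bit
quantization_config = {"bits": 4}  # but did pass an explicit config

# Before the fix: True, because "load_in_8bit is None" alone satisfies the
# "or", so the export config stayed None and default weight compression
# could still kick in for large models during export.
print(load_in_8bit is None or not quantization_config)   # True

# After the fix: False, because a quantization_config is present, so the
# model is exported in fp32 and the requested compression runs afterwards.
print(load_in_8bit is None and not quantization_config)  # False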
21 changes: 21 additions & 0 deletions tests/openvino/test_quantization.py
@@ -490,6 +490,27 @@ def test_ovmodel_load_large_model_with_uncompressed_weights(self):
             }
             save_model_patch.assert_called_with(saving_params)
+
+    def test_ovmodel_load_large_model_with_additional_quantization_config(self):
+        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
+            model_mixin_patch.num_parameters.return_value = 2e9
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    _ = OVModelForCausalLM.from_pretrained(
+                        MODEL_NAMES["llama"],
+                        export=True,
+                        compile=False,
+                        use_cache=False,
+                        quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8),
+                    )
+                    # quantization will be performed later, using load_model
+                    saving_params = {
+                        "model": unittest.mock.ANY,
+                        "path": unittest.mock.ANY,
+                        "compression_option": "fp32",
+                        "compression_ratio": None,
+                    }
+                    save_model_patch.assert_called_with(saving_params)


class OVQuantizerQATest(unittest.TestCase):
SUPPORTED_ARCHITECTURES = (("hf-internal-testing/tiny-random-BertForQuestionAnswering",),)
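The new test mocks num_parameters to report 2e9 parameters, simulating a model large enough that export would otherwise apply default 8-bit weight compression, and asserts that an explicit 4-bit quantization_config now yields an fp32 export, with compression deferred to load_model. The user-facing call it simulates looks roughly like this (a sketch; the checkpoint name is a placeholder):

from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Placeholder checkpoint for illustration; any large causal-LM checkpoint works.
model = OVModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    export=True,
    # With the fix, this explicit config wins over the size-based default:
    # the model is exported as fp32, then compressed to 4-bit weights.
    quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8),
)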
