hot fix for weights compression
eaidova committed Mar 11, 2024
1 parent 72b0630 commit 01c8117
Showing 2 changed files with 25 additions and 4 deletions.
8 changes: 4 additions & 4 deletions optimum/intel/openvino/modeling_decoder.py
@@ -261,10 +261,10 @@ def _from_transformers(
             task = task + "-with-past"
 
         # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None or not quantization_config:
-            ov_config = None
+        if load_in_8bit is None and not quantization_config:
+            ov_export_config = None
         else:
-            ov_config = OVConfig(dtype="fp32")
+            ov_export_config = OVConfig(dtype="fp32")
 
         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
 
@@ -279,7 +279,7 @@ def _from_transformers(
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            ov_config=ov_config,
+            ov_config=ov_export_config,
             stateful=stateful,
         )
 
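The substance of the fix is the change from "or" to "and" in the condition above (the ov_config to ov_export_config rename looks like a clarity change only). With "or", a user who passed an explicit quantization_config while leaving load_in_8bit unset still fell into the None branch, so the size-based default compression could be applied during export on top of the requested scheme. A minimal standalone sketch of how the two conditions diverge (illustration only, not library code):

# Standalone illustration of the flipped condition; not library code.
load_in_8bit = None                # user did not set load_in_8bit
quantization_config = {"bits": 4}  # but did pass an explicit config

# Before the fix: True, because "load_in_8bit is None" alone satisfies the
# "or", so the export config stayed None and default weight compression
# could still kick in for large models during export.
print(load_in_8bit is None or not quantization_config)   # True

# After the fix: False, because a quantization_config is present, so the
# model is exported in fp32 and the requested compression runs afterwards.
print(load_in_8bit is None and not quantization_config)  # False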
21 changes: 21 additions & 0 deletions tests/openvino/test_quantization.py
@@ -490,6 +490,27 @@ def test_ovmodel_load_large_model_with_uncompressed_weights(self):
             }
             save_model_patch.assert_called_with(saving_params)
+
+    def test_ovmodel_load_large_model_with_additional_quantization_config(self):
+        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
+            model_mixin_patch.num_parameters.return_value = 2e9
+            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
+                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
+                    _ = OVModelForCausalLM.from_pretrained(
+                        MODEL_NAMES["llama"],
+                        export=True,
+                        compile=False,
+                        use_cache=False,
+                        quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8),
+                    )
+                    # quantization will be performed later, using load_model
+                    saving_params = {
+                        "model": unittest.mock.ANY,
+                        "path": unittest.mock.ANY,
+                        "compression_option": "fp32",
+                        "compression_ratio": None,
+                    }
+                    save_model_patch.assert_called_with(saving_params)


class OVQuantizerQATest(unittest.TestCase):
SUPPORTED_ARCHITECTURES = (("hf-internal-testing/tiny-random-BertForQuestionAnswering",),)
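The new test mocks num_parameters to report 2e9 parameters, simulating a model large enough that export would otherwise apply default 8-bit weight compression, and asserts that an explicit 4-bit quantization_config now yields an fp32 export, with compression deferred to load_model. The user-facing call it simulates looks roughly like this (a sketch; the checkpoint name is a placeholder):

from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Placeholder checkpoint for illustration; any large causal-LM checkpoint works.
model = OVModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    export=True,
    # With the fix, this explicit config wins over the size-based default:
    # the model is exported as fp32, then compressed to 4-bit weights.
    quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8),
)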
