diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index 3015b20e8..09f2eaa10 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -355,144 +355,150 @@ class StoreAttr(object):
             loading_kwargs["torch_dtype"] = dtype
             patch_16bit = True
 
-    if library_name == "open_clip":
-        model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
-    else:
-        model = TasksManager.get_model_from_task(
-            task,
-            model_name_or_path,
-            subfolder=subfolder,
-            revision=revision,
-            cache_dir=cache_dir,
-            token=token,
-            local_files_only=local_files_only,
-            force_download=force_download,
-            trust_remote_code=trust_remote_code,
-            framework=framework,
-            device=device,
-            library_name=library_name,
-            **loading_kwargs,
-        )
+    try:
+        if library_name == "open_clip":
+            model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
+        else:
+            model = TasksManager.get_model_from_task(
+                task,
+                model_name_or_path,
+                subfolder=subfolder,
+                revision=revision,
+                cache_dir=cache_dir,
+                token=token,
+                local_files_only=local_files_only,
+                force_download=force_download,
+                trust_remote_code=trust_remote_code,
+                framework=framework,
+                device=device,
+                library_name=library_name,
+                **loading_kwargs,
+            )
+
+        needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None
 
-    needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None
+        if needs_pad_token_id:
+            if pad_token_id is not None:
+                model.config.pad_token_id = pad_token_id
+            else:
+                tok = AutoTokenizer.from_pretrained(model_name_or_path)
+                pad_token_id = getattr(tok, "pad_token_id", None)
+                if pad_token_id is None:
+                    raise ValueError(
+                        "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
+                    )
+                model.config.pad_token_id = pad_token_id
 
-    if needs_pad_token_id:
-        if pad_token_id is not None:
-            model.config.pad_token_id = pad_token_id
+        if hasattr(model.config, "export_model_type"):
+            model_type = model.config.export_model_type.replace("_", "-")
         else:
-            tok = AutoTokenizer.from_pretrained(model_name_or_path)
-            pad_token_id = getattr(tok, "pad_token_id", None)
-            if pad_token_id is None:
-                raise ValueError(
-                    "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
+            model_type = model.config.model_type.replace("_", "-")
+
+        if (
+            not custom_architecture
+            and library_name != "diffusers"
+            and task + "-with-past"
+            in TasksManager.get_supported_tasks_for_model_type(
+                model_type, exporter="openvino", library_name=library_name
+            )
+        ):
+            # Make -with-past the default if --task was not explicitly specified
+            if original_task == "auto":
+                task = task + "-with-past"
+            else:
+                logger.info(
+                    f"The task `{task}` was manually specified, and past key values will not be reused in the decoding."
+                    f" If needed, please pass `--task {task}-with-past` to export using the past key values."
                 )
-            model.config.pad_token_id = pad_token_id
 
-    if hasattr(model.config, "export_model_type"):
-        model_type = model.config.export_model_type.replace("_", "-")
-    else:
-        model_type = model.config.model_type.replace("_", "-")
-
-    if (
-        not custom_architecture
-        and library_name != "diffusers"
-        and task + "-with-past"
-        in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="openvino", library_name=library_name)
-    ):
-        # Make -with-past the default if --task was not explicitely specified
-        if original_task == "auto":
-            task = task + "-with-past"
-        else:
-            logger.info(
-                f"The task `{task}` was manually specified, and past key values will not be reused in the decoding."
-                f" if needed, please pass `--task {task}-with-past` to export using the past key values."
-            )
+        if original_task == "auto":
+            synonyms_for_task = sorted(TasksManager.synonyms_for_task(task))
+            if synonyms_for_task:
+                synonyms_for_task = ", ".join(synonyms_for_task)
+                possible_synonyms = f" (possible synonyms are: {synonyms_for_task})"
+            else:
+                possible_synonyms = ""
+            logger.info(f"Automatic task detection to {task}{possible_synonyms}.")
 
-    if original_task == "auto":
-        synonyms_for_task = sorted(TasksManager.synonyms_for_task(task))
-        if synonyms_for_task:
-            synonyms_for_task = ", ".join(synonyms_for_task)
-            possible_synonyms = f" (possible synonyms are: {synonyms_for_task})"
-        else:
-            possible_synonyms = ""
-        logger.info(f"Automatic task detection to {task}{possible_synonyms}.")
+        preprocessors = maybe_load_preprocessors(
+            model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
+        )
 
-    preprocessors = maybe_load_preprocessors(
-        model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
-    )
+        submodel_paths = export_from_model(
+            model=model,
+            output=output,
+            task=task,
+            ov_config=ov_config,
+            stateful=stateful,
+            model_kwargs=model_kwargs,
+            custom_export_configs=custom_export_configs,
+            fn_get_submodels=fn_get_submodels,
+            preprocessors=preprocessors,
+            device=device,
+            trust_remote_code=trust_remote_code,
+            patch_16bit_model=patch_16bit,
+            **kwargs_shapes,
+        )
 
-    submodel_paths = export_from_model(
-        model=model,
-        output=output,
-        task=task,
-        ov_config=ov_config,
-        stateful=stateful,
-        model_kwargs=model_kwargs,
-        custom_export_configs=custom_export_configs,
-        fn_get_submodels=fn_get_submodels,
-        preprocessors=preprocessors,
-        device=device,
-        trust_remote_code=trust_remote_code,
-        patch_16bit_model=patch_16bit,
-        **kwargs_shapes,
-    )
+        if convert_tokenizer:
+            maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task)
 
-    if convert_tokenizer:
-        maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task)
-
-    clear_class_registry()
-    del model
-    gc.collect()
-
-    for submodel_path in submodel_paths:
-        submodel_path = Path(output) / submodel_path
-        submodel = core.read_model(submodel_path)
-
-        quantization_config = None
-        if ov_config is None:
-            num_parameters = 0
-            for op in submodel.get_ops():
-                if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
-                    num_parameters += reduce(operator.mul, op.shape, 1)
-                del op
-                if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
-                    if is_nncf_available():
-                        quantization_config = {"bits": 8, "sym": False}
-                        logger.info("The model weights will be quantized to int8_asym.")
-                    else:
-                        logger.warning(
-                            "The model will be converted with no weights quantization. Quantization of the weights to int8 "
-                            "requires nncf. Please install it with `pip install nncf`"
-                        )
-                    break
-        else:
-            quantization_config = ov_config.quantization_config
-        if quantization_config is None:
-            del submodel
-            gc.collect()
-            continue
+        clear_class_registry()
+        del model
+        gc.collect()
 
-        if not is_nncf_available():
-            raise ImportError("Quantization of the weights requires nncf, please install it with `pip install nncf`")
+        for submodel_path in submodel_paths:
+            submodel_path = Path(output) / submodel_path
+            submodel = core.read_model(submodel_path)
+
+            quantization_config = None
+            if ov_config is None:
+                num_parameters = 0
+                for op in submodel.get_ops():
+                    if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
+                        num_parameters += reduce(operator.mul, op.shape, 1)
+                    del op
+                    if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
+                        if is_nncf_available():
+                            quantization_config = {"bits": 8, "sym": False}
+                            logger.info("The model weights will be quantized to int8_asym.")
+                        else:
+                            logger.warning(
+                                "The model will be converted with no weights quantization. Quantization of the weights to int8 "
+                                "requires nncf. Please install it with `pip install nncf`"
+                            )
+                        break
+            else:
+                quantization_config = ov_config.quantization_config
+            if quantization_config is None:
+                del submodel
+                gc.collect()
+                continue
+
+            if not is_nncf_available():
+                raise ImportError(
+                    "Quantization of the weights requires nncf, please install it with `pip install nncf`"
+                )
 
-        from optimum.intel.openvino.quantization import _weight_only_quantization
+            from optimum.intel.openvino.quantization import _weight_only_quantization
 
-        _weight_only_quantization(submodel, quantization_config)
-        compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
-        save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
-        del submodel
-        gc.collect()
+            _weight_only_quantization(submodel, quantization_config)
+            compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
+            save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
+            del submodel
+            gc.collect()
 
-        submodel_path.unlink()
-        submodel_path.with_suffix(".bin").unlink()
-        compressed_submodel_path.rename(submodel_path)
-        compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
+            submodel_path.unlink()
+            submodel_path.with_suffix(".bin").unlink()
+            compressed_submodel_path.rename(submodel_path)
+            compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
 
-    # Unpatch modules after GPTQ export
-    if do_quant_patching:
-        torch.cuda.is_available = orig_cuda_check
-    if do_gptq_patching:
-        GPTQQuantizer.post_init_model = orig_post_init_model
+    finally:
+        # Unpatch modules after quantized model export
+        if do_quant_patching:
+            torch.cuda.is_available = orig_cuda_check
+        if do_gptq_patching:
+            GPTQQuantizer.post_init_model = orig_post_init_model
 
 
 def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None):
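The core change above wraps model loading, export, and weight compression in a single try/finally so that the module-level monkey-patches (torch.cuda.is_available and, for GPTQ, GPTQQuantizer.post_init_model) are restored even when the export raises. Below is a minimal, self-contained sketch of that pattern; export_fn is a hypothetical stand-in for the body of the try block, not part of the patch:

# Standalone sketch of the try/finally unpatching pattern used in the diff above.
# `export_fn` is a hypothetical stand-in for the loading/export/compression body.
import torch


def export_with_cuda_patch(export_fn, do_quant_patching=True):
    orig_cuda_check = torch.cuda.is_available
    if do_quant_patching:
        # Quantized (GPTQ/AWQ) checkpoints may try to touch CUDA at load time;
        # pretending CUDA is absent forces a CPU-only load for export.
        torch.cuda.is_available = lambda: False
    try:
        return export_fn()  # may raise; the patch must still be reverted
    finally:
        # Runs on success and on failure, so a failed export no longer leaves
        # torch.cuda.is_available patched for the rest of the process.
        if do_quant_patching:
            torch.cuda.is_available = orig_cuda_check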
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index b3245e01c..0f166a635 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -921,9 +921,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
     if platform.system() != "Windows":
         SUPPORTED_ARCHITECTURES += ("opt_gptq",)
 
-        # autoawq install disabled for windows test environment
-        if is_openvino_version(">=", "2024.6.0") and platform.system() != "Windows":
-            SUPPORTED_ARCHITECTURES += ("mixtral_awq",)
+    # autoawq install disabled for windows test environment
+    if is_openvino_version(">=", "2024.6.0") and platform.system() != "Windows":
+        SUPPORTED_ARCHITECTURES += ("mixtral_awq",)
 
     GENERATION_LENGTH = 100
     REMOTE_CODE_MODELS = (
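For reference, the size heuristic that the first hunk moves into the try block can be exercised on its own: parameters held in floating-point Constant ops are counted, and int8 asymmetric weight compression becomes the default only past a threshold. A sketch under stated assumptions: "model.xml" is a placeholder path, and THRESHOLD stands in for _MAX_UNCOMPRESSED_SIZE from optimum.intel; the openvino calls mirror those in the diff.

# Sketch of the default int8 compression heuristic from __main__.py above.
import operator
from functools import reduce

from openvino import Core, Type

THRESHOLD = 1_000_000_000  # illustrative stand-in for _MAX_UNCOMPRESSED_SIZE

core = Core()
submodel = core.read_model("model.xml")  # hypothetical exported submodel

# Count parameters stored in fp16/fp32/bf16 Constant ops.
num_parameters = 0
for op in submodel.get_ops():
    if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
        num_parameters += reduce(operator.mul, op.shape, 1)

if num_parameters >= THRESHOLD:
    # Matches the diff's default: {"bits": 8, "sym": False}, i.e. int8_asym.
    print("Large model: weights would be quantized to int8_asym by default")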