diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index ad2cfe882..62f70ad36 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -272,6 +272,7 @@ def _from_pretrained( subfolder=subfolder, ) file_names[name] = model_cache_path + decoder_with_past = cls.load_model(file_names["decoder_with_past"], quantization_config) else: encoder = cls._compile_model( file_names["encoder"], kwargs.get("device", "CPU"), kwargs.get("ov_config"), model_save_dir @@ -280,6 +281,20 @@ def _from_pretrained( file_names["decoder"], kwargs.get("device", "CPU"), kwargs.get("ov_config"), model_save_dir ) if use_cache and not model_has_state(decoder): + model_file_names["decoder_with_past"] = decoder_with_past_file_name + model_file_names["decoder_with_past_bin"] = decoder_with_past_file_name.replace(".xml", ".bin") + for name in ["decoder_with_past", "decoder_with_past_bin"]: + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=model_file_names[name], + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + subfolder=subfolder, + ) + file_names[name] = model_cache_path decoder_with_past = cls._compile_model( file_names["decoder_with_past"], kwargs.get("device", "CPU"),