diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 582111493f..6885cad1e0 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -165,6 +165,10 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] + if self.config.model_type == "qwen2_vl" and position_ids.ndim != 3: + position_ids = np.expand_dims(position_ids, 0) + position_ids = np.concatenate([position_ids, position_ids, position_ids], axis=0) + inputs["position_ids"] = position_ids if "beam_idx" in self.input_names: diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 240f4f9e3f..03b4b6f496 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1976,7 +1976,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"] if is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"] TASK = "image-text-to-text" REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v"] @@ -1996,6 +1996,10 @@ def get_transformer_model_class(self, model_arch): from transformers import LlavaNextForConditionalGeneration return LlavaNextForConditionalGeneration + if model_arch == "qwen2_vl": + from transformers import Qwen2VLForConditionalGeneration + + return Qwen2VLForConditionalGeneration return AutoModelForCausalLM def _check_device_and_request(self, ov_model, expected_device, has_request): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 17d9dd1fbe..6571198a94 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -115,6 +115,7 @@ "qwen": "katuni4ka/tiny-random-qwen", "qwen2": "fxmarty/tiny-dummy-qwen2", "qwen2-moe": "katuni4ka/tiny-random-qwen1.5-moe", + "qwen2_vl": "katuni4ka/tiny-random-qwen2vl", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-roberta", "roformer": "hf-internal-testing/tiny-random-roformer",