From c26a450cf1be0d203f7d36a823cf856a7dfcc87d Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 29 Nov 2024 15:02:13 +0400 Subject: [PATCH 01/12] qwen2vl support --- .../sentence_transformer_quantization.ipynb | 12 +- optimum/exporters/openvino/model_configs.py | 264 ++++++++++- optimum/exporters/openvino/model_patcher.py | 114 ++++- optimum/exporters/openvino/utils.py | 2 +- .../openvino/modeling_visual_language.py | 411 +++++++++++++++++- 5 files changed, 781 insertions(+), 22 deletions(-) diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb index 714544aa9a..067e1c9913 100644 --- a/notebooks/openvino/sentence_transformer_quantization.ipynb +++ b/notebooks/openvino/sentence_transformer_quantization.ipynb @@ -170,9 +170,11 @@ ], "source": [ "from functools import partial\n", - "import datasets\n", + "\n", "from transformers import AutoTokenizer\n", - "from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig\n", + "\n", + "from optimum.intel import OVConfig, OVModelForFeatureExtraction, OVQuantizationConfig, OVQuantizer\n", + "\n", "\n", "MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", "base_model_path = \"all-MiniLM-L6-v2\"\n", @@ -187,6 +189,7 @@ "\n", "quantizer = OVQuantizer.from_pretrained(model)\n", "\n", + "\n", "def preprocess_function(examples, tokenizer):\n", " return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n", "\n", @@ -225,9 +228,9 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import Pipeline\n", - "import torch.nn.functional as F\n", "import torch\n", + "import torch.nn.functional as F\n", + "from transformers import Pipeline\n", "\n", "\n", "# copied from the model card \"sentence-transformers/all-MiniLM-L6-v2\"\n", @@ -296,6 +299,7 @@ "from datasets import load_dataset\n", "from evaluate import load\n", "\n", + "\n", "eval_dataset = load_dataset(\"glue\", \"stsb\", split=\"validation\")\n", "metric = load(\"glue\", \"stsb\")" ] diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index d9c0165d98..4ec92a6302 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -89,6 +89,8 @@ Phi3ModelPatcher, Phi3VisionImageEmbeddingsPatcher, QwenModelPatcher, + Qwen2VLLanguageModelPatcher, + Qwen2VLVisionEmbMergerPatcher, RotaryEmbPatcher, UpdateCausalMaskModelPatcher, XverseModelPatcher, @@ -106,9 +108,13 @@ def init_model_configs(): "transformers", "LlavaNextForConditionalGeneration", ) - TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[ - "image-text-to-text" - ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"] + TasksManager._CUSTOM_CLASSES[("pt", "qwen2-vl", "image-text-to-text")] = ( + "transformers", + "Qwen2VLForConditionalGeneration", + ) + TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["image-text-to-text"] = ( + TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"] + ) supported_model_types = [ "_SUPPORTED_MODEL_TYPE", @@ -1288,18 +1294,26 @@ def patch_model_for_export( class LMInputEmbedsConfigHelper(TextDecoderWithPositionIdsOnnxConfig): - def __init__(self, export_config): + def __init__(self, export_config, patcher_cls=None, dummy_input_generator=None, inputs_update=None): self.orig_export_config = export_config + if dummy_input_generator is not None: + export_config.DUMMY_INPUT_GENERATOR_CLASSES = ( + dummy_input_generator, + ) + 
export_config.DUMMY_INPUT_GENERATOR_CLASSES self.DUMMY_INPUT_GENERATOR_CLASSES = export_config.DUMMY_INPUT_GENERATOR_CLASSES self.DEFAULT_ONNX_OPSET = export_config.DEFAULT_ONNX_OPSET self.DUMMY_PKV_GENERATOR_CLASS = export_config.DUMMY_PKV_GENERATOR_CLASS self._config = export_config._config self._normalized_config = export_config._normalized_config self.use_past = export_config.use_past + self.patcher_cls = patcher_cls + self.input_info_upd = inputs_update def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": + if self.patcher_cls is not None: + return self.patcher_cls(self, model, model_kwargs=model_kwargs) # Refer to DecoderModelPatcher. return self.orig_export_config.patch_model_for_export(model, model_kwargs=model_kwargs) @@ -1312,6 +1326,8 @@ def inputs(self) -> Dict[str, Dict[int, str]]: orig_inputs = self.orig_export_config.inputs input_ids_config = orig_inputs.pop("input_ids") orig_inputs["inputs_embeds"] = input_ids_config + if self.input_info_upd is not None: + orig_inputs.update(self.input_info_upd) return orig_inputs def generate_dummy_inputs(self, framework: str = "pt", **kwargs): @@ -1383,9 +1399,22 @@ def get_vlm_text_embeddings_config(model_type, model_config, int_dtype, float_dt return export_config -def get_vlm_text_generation_config(model_type, model_config, int_dtype, float_dtype): +def get_vlm_text_generation_config( + model_type, + model_config, + int_dtype, + float_dtype, + model_patcher=None, + dummy_input_generator=None, + inputs_update=None, +): internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype) - export_config = LMInputEmbedsConfigHelper(internal_export_config) + export_config = LMInputEmbedsConfigHelper( + internal_export_config, + patcher_cls=model_patcher, + dummy_input_generator=dummy_input_generator, + inputs_update=inputs_update, + ) export_config._normalized_config = internal_export_config._normalized_config return export_config @@ -1820,9 +1849,11 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int img_ids_height = self.height // 2 img_ids_width = self.width // 2 return self.random_int_tensor( - [self.batch_size, img_ids_height * img_ids_width, 3] - if is_diffusers_version("<", "0.31.0") - else [img_ids_height * img_ids_width, 3], + ( + [self.batch_size, img_ids_height * img_ids_width, 3] + if is_diffusers_version("<", "0.31.0") + else [img_ids_height * img_ids_width, 3] + ), min_value=0, max_value=min(img_ids_height, img_ids_width), framework=framework, @@ -2259,3 +2290,218 @@ def patch_model_for_export( if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS: return Phi3VisionImageEmbeddingsPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) + + +class DummyQwen2VLLMInputGenerator(DummyTextInputGenerator): + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + generated_input = super().generate(input_name, framework, int_dtype, float_dtype) + if input_name == "position_ids": + return generated_input.unsqueeze(0).expand(3, -1, -1) + return generated_input + + +class DummyQwen2VLVisionEMbedInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("hidden_states",) + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = 1, + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: 
int = 420, + height: int = 420, + **kwargs, + ): + self.batch_size = batch_size + self.height = height + self.width = width + self.num_channels = num_channels + self.temporal_patch_size = normalized_config.config.temporal_patch_size + self.patch_size = normalized_config.config.patch_size + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size + grid_t = self.batch_size + shape = [ + grid_t * grid_h * grid_w, + self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size, + ] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + + +class DummyQwen2VLVisionEmbedMergerInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("hidden_states", "attention_mask", "rotary_pos_emb") + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = 1, + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = 420, + height: int = 420, + **kwargs, + ): + self.batch_size = batch_size + self.height = height + self.width = width + self.num_channels = num_channels + self.temporal_patch_size = normalized_config.config.temporal_patch_size + self.patch_size = normalized_config.config.patch_size + self.embed_dim = normalized_config.config.embed_dim + self.num_heads = normalized_config.config.num_heads + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size + grid_t = self.batch_size + + if input_name == "hidden_states": + return self.random_float_tensor( + [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype + ) + + if input_name == "attention_mask": + return self.random_mask_tensor( + [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype + ) + + if input_name == "rotary_pos_emb": + dim = self.embed_dim // self.num_heads // 2 + return self.random_float_tensor([grid_h * grid_t * grid_w, dim], framework=framework, dtype=float_dtype) + + +class Qwen2VLConfigBehavior(str, enum.Enum): + LANGUAGE = "language" + VISION_EMBEDDINGS = "vision_embeddings" + VISION_EMBEDDINGS_MERGER = "vision_embeddings_merger" + TEXT_EMBEDDINGS = "text_embeddings" + + +@register_in_tasks_manager("qwen2-vl", *["image-text-to-text"], library_name="transformers") +class Qwen2VLOpenVINOConfig(OnnxConfig): + SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior] + NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig + DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEMbedInputGenerator,) + MIN_TRANSFORMERS_VERSION = version.parse("4.45.0") + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: Qwen2VLConfigBehavior = Qwen2VLConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + ) + self._behavior = behavior + self._orig_config = config + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self.DUMMY_INPUT_GENERATOR_CLASSES = 
(DummyQwen2VLVisionEMbedInputGenerator,) + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedMergerInputGenerator,) + + @staticmethod + def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): + if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior): + behavior = Qwen2VLConfigBehavior(behavior) + + if behavior == Qwen2VLConfigBehavior.LANGUAGE: + return model + + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: + vision_embeddings = model.visual.patch_embed + vision_embeddings.config = model.config.vision_config + return vision_embeddings + + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + vision_emb_merger = model.visual + vision_emb_merger.config = model.config.vision_config + return vision_emb_merger + + if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: + text_embedding = model.model.embed_tokens + text_embedding.config = model.config + return text_embedding + + def with_behavior( + self, + behavior: Union[str, Qwen2VLConfigBehavior], + ): + """ + Creates a config for different behaviour. + Args: + behavior ([`ConfigBehavior`]): + The behavior to use for the new instance. + """ + if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior): + behavior = Qwen2VLConfigBehavior(behavior) + + if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + + if behavior == Qwen2VLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen2", + self._orig_config, + self.int_dtype, + self.float_dtype, + model_patcher=Qwen2VLLanguageModelPatcher, + dummy_input_generator=DummyQwen2VLLMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + ) + + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ): + model_kwargs = model_kwargs or {} + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return Qwen2VLVisionEmbMergerPatcher(self, model, model_kwargs) + return super().patch_model_for_export(model, model_kwargs) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS: + return {"hidden_states": {0: "patch_thw_grid", 1: "patch_temporal_channels"}} + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return { + "hidden_states": {0: "sequence_length"}, + "attention_mask": {1: "sequence_length", 2: "sequence_length"}, + "rotary_pos_emb": {0: "sequence_length"}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior in [Qwen2VLConfigBehavior.VISION_EMBEDDINGS, Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER]: + return 
{"last_hidden_state": {0: "seq_len"}} + return {} diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index c71cbfe003..af830849eb 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -18,9 +18,11 @@ import math import types from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from optimum.exporters.onnx.base import OnnxConfig import torch import torch.nn.functional as F +from transformers import PreTrainedModel, TFPreTrainedModel from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling from transformers.utils import is_tf_available @@ -421,9 +423,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -2058,9 +2060,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -3378,3 +3380,103 @@ def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) for block in self._model.model.layers: block.self_attn.forward = block.self_attn._orig_forward + + +class Qwen2VLLanguageModelPatcher(DecoderModelPatcher): + def __init__( + self, + config: OnnxConfig, + model: PreTrainedModel | TFPreTrainedModel, + model_kwargs: Dict[str, Any] | None = None, + ): + + model.__orig_forward = model.forward + + def forward_wrap( + self, + attention_mask, + position_ids=None, + past_key_values=None, + inputs_embeds=None, + input_ids=None, + ): + from transformers.cache_utils import DynamicCache + + new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + result = self.__orig_forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=new_past_key_values, + inputs_embeds=inputs_embeds, + ) + if past_key_values is not None: + result["past_key_values"] = result["past_key_values"].to_legacy_cache() + return result + + model.forward = types.MethodType(forward_wrap, model) + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + +class Qwen2VLVisionEmbMergerPatcher(ModelPatcher): + def __init__( + self, + config: OnnxConfig, + model: PreTrainedModel | TFPreTrainedModel, + model_kwargs: Dict[str, Any] | None = None, + ): + model.__orig_forward = model.forward + + def image_embed_forward( + self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor + ) -> torch.Tensor: + for blk in self.blocks: + hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb) + return self.merger(hidden_states) + + model.forward = 
types.MethodType(image_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __enter__(self): + def sdpa_attn_forward( + self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor = None + ) -> torch.Tensor: + from transformers.models.qwen2_vl.modeling_qwen2_vl import apply_rotary_pos_emb_vision + + seq_length = hidden_states.shape[0] + q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0) + q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0) + k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0) + + q = q.transpose(0, 1) + k = k.transpose(0, 1) + v = v.transpose(0, 1) + attn_output = torch.nn.functional.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0) + attn_output = attn_output.transpose(0, 1) + attn_output = attn_output.reshape(seq_length, -1) + attn_output = self.proj(attn_output) + return attn_output + + def block_forward(self, hidden_states, attention_mask, rotary_pos_emb) -> torch.Tensor: + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + for block in self._model.blocks: + block._orig_forward = block.forward + block.forward = types.MethodType(block_forward, block) + block.attn._orig_forward = block.attn.forward + block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + for block in self._model.blocks: + block.forward = block._orig_forward + block.attn.forward = block.attn._orig_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 302fee5b65..5242db1d1a 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -216,7 +216,7 @@ def get_submodels(model): return custom_export, fn_get_submodels -MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v"] +MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v", "qwen2-vl"] def save_config(config, save_dir): diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index aa9b23a0ee..7ca3a0cf15 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -1,10 +1,11 @@ import copy +from dataclasses import dataclass import logging import os import warnings from abc import abstractmethod from pathlib import Path -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, Any import numpy as np import openvino as ov @@ -22,6 +23,7 @@ PreTrainedTokenizer, ) from transformers.modeling_outputs import BaseModelOutputWithPooling +from transformers.utils import ModelOutput from ...exporters.openvino import main_export from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name @@ -216,7 +218,12 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None: key.get_any_name() for key in self.model.outputs[2:] if "hidden_states" in key.get_any_name() ] self.input_names = {key.get_any_name(): 
idx for idx, key in enumerate(self.model.inputs)} - self._main_input = "images" if model_has_input_output_name(self.model, "images") else "pixel_values" + if model_has_input_output_name(self.model, "images"): + self._main_input = "images" + elif model_has_input_output_name(self.model, "hidden_states"): + self._main_input = "hidden_states" + else: + self._main_input = "pixel_values" def forward(self, pixel_values, **kwargs): self._compile() @@ -269,6 +276,7 @@ def forward(self, img_features): "language_model": OVModelWithEmbedForCausalLM, "vision_embeddings": OVVisionEmbedding, "vision_projection": OVVisionProjection, + "vision_embeddings_merger": OVVisionEmbedding, } @@ -676,6 +684,10 @@ def forward( position_ids=None, image_bound=None, tgt_sizes=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + rope_deltas=None, **kwargs, ): inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings( @@ -687,6 +699,10 @@ def forward( past_key_values=past_key_values, image_bound=image_bound, tgt_sizes=tgt_sizes, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + rope_deltas=rope_deltas, **kwargs, ) return self.language_model.forward( @@ -781,6 +797,9 @@ def prepare_inputs_for_generation( "image_sizes": image_sizes, "image_bound": kwargs.get("image_bound"), "tgt_sizes": kwargs.get("tgt_sizes"), + "pixel_values_videos": kwargs.get("pixel_values_videos"), + "image_grid_thw": kwargs.get("image_grid_thw"), + "video_grid_thw": kwargs.get("video_grid_thw"), } ) return model_inputs @@ -2036,6 +2055,393 @@ def preprocess_inputs( return inputs +@dataclass +class QWen2VLModelOutputWithPast(ModelOutput): + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + rope_deltas: Optional[torch.FloatTensor] = None + + +class _OVQwen2VLForCausalLM(OVModelForVisualCausalLM): + additional_parts = ["vision_embeddings_merger"] + + def __init__( + self, + language_model: ov.Model, + text_embeddings: ov.Model, + vision_embeddings: ov.Model, + config: PretrainedConfig = None, + device: str = "CPU", + dynamic_shapes: bool = True, + ov_config: Optional[Dict[str, str]] = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + **kwargs, + ): + super().__init__( + language_model=language_model, + text_embeddings=text_embeddings, + vision_embeddings=vision_embeddings, + config=config, + device=device, + dynamic_shapes=dynamic_shapes, + ov_config=ov_config, + model_save_dir=model_save_dir, + quantization_config=quantization_config, + **kwargs, + ) + try: + from transformers.models.qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding + + self._rotary_pos_emb = VisionRotaryEmbedding( + self.config.vision_config.embed_dim // self.config.vision_config.num_heads // 2 + ) + except ImportError: + raise ValueError( + f"Initialization model for {self.config.model_type} required at least transformers >= 4.45" + ) + + def get_rope_index( + self, + input_ids: torch.LongTensor, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Calculate the 3D rope index based on image and video's 
temporal, height and width in LLM. + """ + spatial_merge_size = self.config.vision_config.spatial_merge_size + image_token_id = self.config.image_token_id + video_token_id = self.config.video_token_id + vision_start_token_id = self.config.vision_start_token_id + mrope_position_deltas = [] + if image_grid_thw is not None or video_grid_thw is not None: + total_input_ids = input_ids + position_ids = torch.ones( + 3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device + ) + image_index, video_index = 0, 0 + for i, input_ids in enumerate(total_input_ids): + if attention_mask is not None: + input_ids = input_ids[attention_mask[i] == 1] + image_nums, video_nums = 0, 0 + vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) + vision_tokens = input_ids[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + input_tokens = input_ids.tolist() + llm_pos_ids_list: list = [] + st = 0 + remain_images, remain_videos = image_nums, video_nums + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) + mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) + mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device) + max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] + mrope_position_deltas = max_position_ids + 1 - 
attention_mask.shape[-1] + else: + position_ids = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = torch.zeros( + [input_ids.shape[0], 1], + device=input_ids.device, + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + if past_key_values is not None: + if inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + rope_deltas = kwargs.get("rope_deltas", None) + if attention_mask is not None and position_ids is None: + if cache_position is None or (cache_position is not None and cache_position[0] == 0): + position_ids, rope_deltas = self.get_rope_index( + input_ids, image_grid_thw, video_grid_thw, attention_mask + ) + else: + batch_size, seq_length = input_ids.shape + delta = ( + cache_position[0] + rope_deltas if cache_position is not None and rope_deltas is not None else 0 + ) + position_ids = torch.arange(seq_length, device=input_ids.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + + if cache_position[0] != 0: + pixel_values = None + pixel_values_videos = None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "pixel_values_videos": pixel_values_videos, + "image_grid_thw": image_grid_thw, + "video_grid_thw": video_grid_thw, + "rope_deltas": rope_deltas, + } + ) + return model_inputs + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + num_new_tokens: int = 1, + ) -> Dict[str, Any]: + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + num_new_tokens=num_new_tokens, + ) + + if getattr(outputs, "rope_deltas", None) is not None: + model_kwargs["rope_deltas"] = outputs.rope_deltas + + return model_kwargs + + def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs): + hidden_states = self.vision_embeddings(pixel_values)[0] + rotary_pos_emb = self.rot_pos_emb(grid_thw) + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32 + ) + cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0) + attention_mask = torch.zeros((1, 
hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True + + causal_mask.masked_fill_(torch.logical_not(attention_mask), float("-inf")) + + res = self.vision_embeddings_merger( + pixel_values=hidden_states, attention_mask=causal_mask, rotary_pos_emb=rotary_pos_emb + )[0] + return res + + def rot_pos_emb(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + w // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + w // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self._rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def get_multimodal_embeddings( + self, + input_ids, + pixel_values=None, + attention_mask=None, + position_ids=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + + inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) + if pixel_values is not None and input_ids.shape[1] != 1: + image_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values, image_grid_thw)) + image_mask = input_ids == self.config.image_token_id + inputs_embeds[image_mask] = image_embeds + if pixel_values_videos is not None and input_ids.shape[1] != 1: + pixel_values_videos = pixel_values_videos + video_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values_videos, video_grid_thw)) + video_mask = input_ids == self.config.video_token_id + inputs_embeds[video_mask] = video_embeds + return inputs_embeds, attention_mask, position_ids + + def forward( + self, + input_ids, + pixel_values=None, + past_key_values=None, + inputs_embeds=None, + image_sizes=None, + attention_mask=None, + position_ids=None, + image_bound=None, + tgt_sizes=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + rope_deltas=None, + **kwargs, + ): + result = super().forward( + input_ids, + pixel_values, + past_key_values, + inputs_embeds, + image_sizes, + attention_mask, + position_ids, + image_bound, + tgt_sizes, + pixel_values_videos, + image_grid_thw, + video_grid_thw, + rope_deltas, + **kwargs, + ) + final_result = QWen2VLModelOutputWithPast( + logits=result.logits, past_key_values=result.past_key_values, rope_deltas=rope_deltas + ) + return final_result + + @staticmethod + def preprocess_inputs( + text: str, + image: Optional["Image"] = None, + processor: Optional[AutoImageProcessor] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + config: Optional[PretrainedConfig] = None, + ): + if processor is None: + raise ValueError("Processor is required.") 
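+        # build a chat-style conversation so that `apply_chat_template` inserts the image placeholder
+        # tokens Qwen2-VL expects before tokenization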
+ if image is not None: + conversation = [ + { + "role": "user", + "content": [ + { + "type": "image", + }, + {"type": "text", "text": text}, + ], + } + ] + else: + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + ], + } + ] + text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + inputs = processor(images=image, text=text_prompt, return_tensors="pt") + return inputs + + MODEL_TYPE_TO_CLS_MAPPING = { "llava": _OVLlavaForCausalLM, "llava_next": _OVLlavaNextForCausalLM, @@ -2043,4 +2449,5 @@ def preprocess_inputs( "llava-qwen2": _OVNanoLlavaForCausalLM, "phi3_v": _OVPhi3VisionForCausalLM, "internvl_chat": _OVInternVLForCausalLM, + "qwen2_vl": _OVQwen2VLForCausalLM, } From 973f155a2ab5f5ff5dc7c11d70d1fa367237c40b Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 4 Dec 2024 09:08:42 +0400 Subject: [PATCH 02/12] fix code style --- optimum/exporters/openvino/model_configs.py | 8 +++---- optimum/exporters/openvino/model_patcher.py | 23 +++++++++---------- optimum/exporters/openvino/utils.py | 10 +++++++- .../openvino/modeling_visual_language.py | 5 ++-- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 4ec92a6302..7e0be486ef 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -88,9 +88,9 @@ PersimmonModelPatcher, Phi3ModelPatcher, Phi3VisionImageEmbeddingsPatcher, - QwenModelPatcher, Qwen2VLLanguageModelPatcher, Qwen2VLVisionEmbMergerPatcher, + QwenModelPatcher, RotaryEmbPatcher, UpdateCausalMaskModelPatcher, XverseModelPatcher, @@ -112,9 +112,9 @@ def init_model_configs(): "transformers", "Qwen2VLForConditionalGeneration", ) - TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["image-text-to-text"] = ( - TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"] - ) + TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[ + "image-text-to-text" + ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"] supported_model_types = [ "_SUPPORTED_MODEL_TYPE", diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index af830849eb..cad605c920 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -18,7 +18,6 @@ import math import types from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from optimum.exporters.onnx.base import OnnxConfig import torch import torch.nn.functional as F @@ -26,6 +25,7 @@ from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling from transformers.utils import is_tf_available +from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.model_patcher import DecoderModelPatcher, ModelPatcher, override_arguments from optimum.intel.utils.import_utils import ( _openvino_version, @@ -423,9 +423,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" @@ -2060,9 +2060,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape 
= attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" @@ -3386,10 +3386,9 @@ class Qwen2VLLanguageModelPatcher(DecoderModelPatcher): def __init__( self, config: OnnxConfig, - model: PreTrainedModel | TFPreTrainedModel, - model_kwargs: Dict[str, Any] | None = None, + model: Union[PreTrainedModel, TFPreTrainedModel], + model_kwargs: Dict[str, Any] = None, ): - model.__orig_forward = model.forward def forward_wrap( @@ -3426,8 +3425,8 @@ class Qwen2VLVisionEmbMergerPatcher(ModelPatcher): def __init__( self, config: OnnxConfig, - model: PreTrainedModel | TFPreTrainedModel, - model_kwargs: Dict[str, Any] | None = None, + model: Union[PreTrainedModel, TFPreTrainedModel], + model_kwargs: Dict[str, Any] = None, ): model.__orig_forward = model.forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 5242db1d1a..9891395a38 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -216,7 +216,15 @@ def get_submodels(model): return custom_export, fn_get_submodels -MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v", "qwen2-vl"] +MULTI_MODAL_TEXT_GENERATION_MODELS = [ + "llava", + "llava-next", + "llava-qwen2", + "internvl-chat", + "minicpmv", + "phi3-v", + "qwen2-vl", +] def save_config(config, save_dir): diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 7ca3a0cf15..582111493f 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -1,11 +1,11 @@ import copy -from dataclasses import dataclass import logging import os import warnings from abc import abstractmethod +from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, Any +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union import numpy as np import openvino as ov @@ -2354,7 +2354,6 @@ def get_multimodal_embeddings( video_grid_thw=None, **kwargs, ): - inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) if pixel_values is not None and input_ids.shape[1] != 1: image_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values, image_grid_thw)) From 7af7cdcfbada0dcd4a2e3917adbeac3f587c3f07 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 4 Dec 2024 10:59:13 +0400 Subject: [PATCH 03/12] add test case --- optimum/intel/openvino/modeling_visual_language.py | 4 ++++ tests/openvino/test_modeling.py | 6 +++++- tests/openvino/utils_tests.py | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 582111493f..6885cad1e0 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -165,6 +165,10 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] + if self.config.model_type == "qwen2_vl" and position_ids.ndim != 3: + position_ids = np.expand_dims(position_ids, 0) + position_ids = np.concatenate([position_ids, position_ids, position_ids], axis=0) 
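+            # stack three copies so position_ids has shape (3, batch, seq); the leading axis holds the
+            # temporal/height/width components of qwen2_vl's multimodal rotary embedding (see get_rope_index)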
+ inputs["position_ids"] = position_ids if "beam_idx" in self.input_names: diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 240f4f9e3f..03b4b6f496 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1976,7 +1976,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"] if is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"] TASK = "image-text-to-text" REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v"] @@ -1996,6 +1996,10 @@ def get_transformer_model_class(self, model_arch): from transformers import LlavaNextForConditionalGeneration return LlavaNextForConditionalGeneration + if model_arch == "qwen2_vl": + from transformers import Qwen2VLForConditionalGeneration + + return Qwen2VLForConditionalGeneration return AutoModelForCausalLM def _check_device_and_request(self, ov_model, expected_device, has_request): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 17d9dd1fbe..6571198a94 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -115,6 +115,7 @@ "qwen": "katuni4ka/tiny-random-qwen", "qwen2": "fxmarty/tiny-dummy-qwen2", "qwen2-moe": "katuni4ka/tiny-random-qwen1.5-moe", + "qwen2_vl": "katuni4ka/tiny-random-qwen2vl", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-roberta", "roformer": "hf-internal-testing/tiny-random-roformer", From 02e6dc95bef72018a4bf8ebeb9fc97861708f3f9 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 5 Dec 2024 16:13:58 +0100 Subject: [PATCH 04/12] Added compression tests for qwen2-vl --- tests/openvino/test_exporters_cli.py | 7 +++++++ tests/openvino/test_quantization.py | 16 ++++++++++++++++ tests/openvino/utils_tests.py | 1 + 3 files changed, 24 insertions(+) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index f94d0f4b5d..38d0a7263c 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -177,6 +177,13 @@ class OVCLIExportTestCase(unittest.TestCase): "--dataset contextual --num-samples 1 --trust-remote-code", {"int8": 4, "int4": 14}, ), + ( + "image-text-to-text", + "qwen2_vl", + 'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" ' + "--dataset contextual --num-samples 1 --trust-remote-code", + {"int8": 8, "int4": 22}, + ), ] ) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index e97eed5aed..6adcd0a9b0 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -415,6 +415,21 @@ class OVWeightCompressionTest(unittest.TestCase): ), {"int4": 14, "int8": 4}, ), + ( + OVModelForVisualCausalLM, + "qwen2_vl", + True, + dict( + bits=4, + group_size=16, + dataset="contextual", + ratio=0.8, + sensitivity_metric="mean_activation_magnitude", + num_samples=1, + trust_remote_code=True, + ), + {"int4": 20, "int8": 10}, + ), ] ) @@ -439,6 +454,7 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version(">=", "4.45.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv", True)) + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen2_vl", False)) 
SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 6571198a94..e37ad5baeb 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -188,6 +188,7 @@ "llava_next": (30, 9, 1), "minicpmv": (30, 26, 1, 6), "nanollava": (30, 15, 1), + "qwen2_vl": (30, 1, 1, 10), } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From 14c186f6cd1b7970641a98148f978839177ae4a4 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 5 Dec 2024 16:30:15 +0100 Subject: [PATCH 05/12] Remove trust_remote_code --- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/test_quantization.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 38d0a7263c..702eb19c04 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -181,7 +181,7 @@ class OVCLIExportTestCase(unittest.TestCase): "image-text-to-text", "qwen2_vl", 'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" ' - "--dataset contextual --num-samples 1 --trust-remote-code", + "--dataset contextual --num-samples 1", {"int8": 8, "int4": 22}, ), ] diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 6adcd0a9b0..986cda0c47 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -418,7 +418,7 @@ class OVWeightCompressionTest(unittest.TestCase): ( OVModelForVisualCausalLM, "qwen2_vl", - True, + False, dict( bits=4, group_size=16, @@ -426,7 +426,6 @@ class OVWeightCompressionTest(unittest.TestCase): ratio=0.8, sensitivity_metric="mean_activation_magnitude", num_samples=1, - trust_remote_code=True, ), {"int4": 20, "int8": 10}, ), From 163dadd53e0995c5e275dae5e097e9724c49b294 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 6 Dec 2024 10:08:23 +0400 Subject: [PATCH 06/12] Apply suggestions from code review Co-authored-by: Nikita Savelyev --- optimum/intel/openvino/modeling_visual_language.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 6885cad1e0..2405b68626 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -166,8 +166,7 @@ def prepare_inputs( position_ids = position_ids[:, -inputs_embeds.shape[1] :] if self.config.model_type == "qwen2_vl" and position_ids.ndim != 3: - position_ids = np.expand_dims(position_ids, 0) - position_ids = np.concatenate([position_ids, position_ids, position_ids], axis=0) + position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids @@ -2132,7 +2131,6 @@ def get_rope_index( for i, input_ids in enumerate(total_input_ids): if attention_mask is not None: input_ids = input_ids[attention_mask[i] == 1] - image_nums, video_nums = 0, 0 vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) vision_tokens = input_ids[vision_start_indices + 1] image_nums = (vision_tokens == image_token_id).sum() From 550d8a634d163c9dabcc5ee1f081fb86bf7ea4b7 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 6 Dec 2024 11:32:14 +0400 Subject: [PATCH 07/12] revert changes in notebook --- .../openvino/sentence_transformer_quantization.ipynb | 12 
++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb index 067e1c9913..714544aa9a 100644 --- a/notebooks/openvino/sentence_transformer_quantization.ipynb +++ b/notebooks/openvino/sentence_transformer_quantization.ipynb @@ -170,11 +170,9 @@ ], "source": [ "from functools import partial\n", - "\n", + "import datasets\n", "from transformers import AutoTokenizer\n", - "\n", - "from optimum.intel import OVConfig, OVModelForFeatureExtraction, OVQuantizationConfig, OVQuantizer\n", - "\n", + "from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig\n", "\n", "MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", "base_model_path = \"all-MiniLM-L6-v2\"\n", @@ -189,7 +187,6 @@ "\n", "quantizer = OVQuantizer.from_pretrained(model)\n", "\n", - "\n", "def preprocess_function(examples, tokenizer):\n", " return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n", "\n", @@ -228,9 +225,9 @@ "metadata": {}, "outputs": [], "source": [ - "import torch\n", - "import torch.nn.functional as F\n", "from transformers import Pipeline\n", + "import torch.nn.functional as F\n", + "import torch\n", "\n", "\n", "# copied from the model card \"sentence-transformers/all-MiniLM-L6-v2\"\n", @@ -299,7 +296,6 @@ "from datasets import load_dataset\n", "from evaluate import load\n", "\n", - "\n", "eval_dataset = load_dataset(\"glue\", \"stsb\", split=\"validation\")\n", "metric = load(\"glue\", \"stsb\")" ] From 2f92e6e1f02b251dbdd5b262d768fca20a521da3 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 17 Dec 2024 18:21:09 +0400 Subject: [PATCH 08/12] apply review comments --- optimum/exporters/openvino/model_configs.py | 44 ++++--------------- .../openvino/modeling_visual_language.py | 5 ++- 2 files changed, 12 insertions(+), 37 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 7e0be486ef..c07e1544f2 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2300,37 +2300,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return generated_input -class DummyQwen2VLVisionEMbedInputGenerator(DummyVisionInputGenerator): - SUPPORTED_INPUT_NAMES = ("hidden_states",) - - def __init__( - self, - task: str, - normalized_config: NormalizedVisionConfig, - batch_size: int = 1, - num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], - width: int = 420, - height: int = 420, - **kwargs, - ): - self.batch_size = batch_size - self.height = height - self.width = width - self.num_channels = num_channels - self.temporal_patch_size = normalized_config.config.temporal_patch_size - self.patch_size = normalized_config.config.patch_size - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size - grid_t = self.batch_size - shape = [ - grid_t * grid_h * grid_w, - self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size, - ] - return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) - - -class DummyQwen2VLVisionEmbedMergerInputGenerator(DummyVisionInputGenerator): +class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ("hidden_states", 
"attention_mask", "rotary_pos_emb") def __init__( @@ -2349,7 +2319,10 @@ def __init__( self.num_channels = num_channels self.temporal_patch_size = normalized_config.config.temporal_patch_size self.patch_size = normalized_config.config.patch_size - self.embed_dim = normalized_config.config.embed_dim + if normalized_config.use_embed_dim: + self.embed_dim = normalized_config.config.embed_dim + else: + self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size self.num_heads = normalized_config.config.num_heads def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): @@ -2382,7 +2355,7 @@ class Qwen2VLConfigBehavior(str, enum.Enum): class Qwen2VLOpenVINOConfig(OnnxConfig): SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig - DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEMbedInputGenerator,) + DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,) MIN_TRANSFORMERS_VERSION = version.parse("4.45.0") def __init__( @@ -2405,12 +2378,13 @@ def __init__( self._orig_config = config if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): self._config = config.vision_config + self._config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) - self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEMbedInputGenerator,) + self._normalized_config.use_embed_dim = False if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER and hasattr(config, "vision_config"): self._config = config.vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) - self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedMergerInputGenerator,) + self._normalized_config.use_embed_dim = True @staticmethod def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 2405b68626..41c879e481 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -29,6 +29,7 @@ from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name from ...exporters.openvino.utils import save_config from .. 
import OVQuantizer +from ..utils.import_utils import is_transformers_version from .configuration import OVConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel, OVModelPart from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM @@ -2096,13 +2097,13 @@ def __init__( quantization_config=quantization_config, **kwargs, ) - try: + if is_transformers_version(">=", "4.45.0"): from transformers.models.qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding self._rotary_pos_emb = VisionRotaryEmbedding( self.config.vision_config.embed_dim // self.config.vision_config.num_heads // 2 ) - except ImportError: + else: raise ValueError( f"Initialization model for {self.config.model_type} required at least transformers >= 4.45" ) From f6fdfba44d6c9acb0f771aa4f864244b35af2bbe Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 17 Dec 2024 18:33:45 +0400 Subject: [PATCH 09/12] add comments for patching --- optimum/exporters/openvino/model_patcher.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index cad605c920..d8ab55d51c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3430,6 +3430,9 @@ def __init__( ): model.__orig_forward = model.forward + # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1118 + # added attention_mask input instead cu_lens for its internal calculation model (unsupported by tracing due to cycle with dynamic len) + # separated patch_embed and rot_pos_emb calls for performing as part of another model def image_embed_forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor ) -> torch.Tensor: @@ -3441,6 +3444,8 @@ def image_embed_forward( super().__init__(config, model, model_kwargs) def __enter__(self): + # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L390 + # added attention_mask input instead of internal calculation (unsupported by tracing due to cycle with dynamic len) def sdpa_attn_forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor = None ) -> torch.Tensor: @@ -3460,6 +3465,8 @@ def sdpa_attn_forward( attn_output = self.proj(attn_output) return attn_output + # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L430 + # added attention_mask input propagation to self.attn def block_forward(self, hidden_states, attention_mask, rotary_pos_emb) -> torch.Tensor: hidden_states = hidden_states + self.attn( self.norm1(hidden_states), attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb From 797f912c019108c2ecf6b2ba898f6a31d3263319 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 17 Dec 2024 18:58:10 +0400 Subject: [PATCH 10/12] reuse original methods if possile --- .../openvino/modeling_visual_language.py | 182 +----------------- 1 file changed, 10 insertions(+), 172 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 77781f61c0..3476d56930 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -5,6 +5,7 @@ from abc import abstractmethod from dataclasses import dataclass from pathlib import Path +from types import 
MethodType from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union import numpy as np @@ -2099,187 +2100,24 @@ def __init__( **kwargs, ) if is_transformers_version(">=", "4.45.0"): - from transformers.models.qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding + from transformers.models.qwen2_vl.modeling_qwen2_vl import ( + Qwen2VLForConditionalGeneration, + VisionRotaryEmbedding, + ) self._rotary_pos_emb = VisionRotaryEmbedding( self.config.vision_config.embed_dim // self.config.vision_config.num_heads // 2 ) + self.get_rope_index = MethodType(Qwen2VLForConditionalGeneration.get_rope_index, self) + self.prepare_inputs_for_generation = MethodType( + Qwen2VLForConditionalGeneration.prepare_inputs_for_generation, self + ) else: raise ValueError( f"Initialization model for {self.config.model_type} required at least transformers >= 4.45" ) - def get_rope_index( - self, - input_ids: torch.LongTensor, - image_grid_thw: Optional[torch.LongTensor] = None, - video_grid_thw: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Calculate the 3D rope index based on image and video's temporal, height and width in LLM. - """ - spatial_merge_size = self.config.vision_config.spatial_merge_size - image_token_id = self.config.image_token_id - video_token_id = self.config.video_token_id - vision_start_token_id = self.config.vision_start_token_id - mrope_position_deltas = [] - if image_grid_thw is not None or video_grid_thw is not None: - total_input_ids = input_ids - position_ids = torch.ones( - 3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device - ) - image_index, video_index = 0, 0 - for i, input_ids in enumerate(total_input_ids): - if attention_mask is not None: - input_ids = input_ids[attention_mask[i] == 1] - vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) - vision_tokens = input_ids[vision_start_indices + 1] - image_nums = (vision_tokens == image_token_id).sum() - video_nums = (vision_tokens == video_token_id).sum() - input_tokens = input_ids.tolist() - llm_pos_ids_list: list = [] - st = 0 - remain_images, remain_videos = image_nums, video_nums - for _ in range(image_nums + video_nums): - if image_token_id in input_tokens and remain_images > 0: - ed_image = input_tokens.index(image_token_id, st) - else: - ed_image = len(input_tokens) + 1 - if video_token_id in input_tokens and remain_videos > 0: - ed_video = input_tokens.index(video_token_id, st) - else: - ed_video = len(input_tokens) + 1 - if ed_image < ed_video: - t, h, w = ( - image_grid_thw[image_index][0], - image_grid_thw[image_index][1], - image_grid_thw[image_index][2], - ) - image_index += 1 - remain_images -= 1 - ed = ed_image - else: - t, h, w = ( - video_grid_thw[video_index][0], - video_grid_thw[video_index][1], - video_grid_thw[video_index][2], - ) - video_index += 1 - remain_videos -= 1 - ed = ed_video - llm_grid_t, llm_grid_h, llm_grid_w = ( - t.item(), - h.item() // spatial_merge_size, - w.item() // spatial_merge_size, - ) - text_len = ed - st - - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) - - t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() - h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() - w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, 
llm_grid_h, -1).flatten() - llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx) - st = ed + llm_grid_t * llm_grid_h * llm_grid_w - - if st < len(input_tokens): - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - text_len = len(input_tokens) - st - llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) - - llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) - mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) - mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1) - return position_ids, mrope_position_deltas - else: - if attention_mask is not None: - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device) - max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] - mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] - else: - position_ids = ( - torch.arange(input_ids.shape[1], device=input_ids.device) - .view(1, 1, -1) - .expand(3, input_ids.shape[0], -1) - ) - mrope_position_deltas = torch.zeros( - [input_ids.shape[0], 1], - device=input_ids.device, - dtype=input_ids.dtype, - ) - - return position_ids, mrope_position_deltas - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - pixel_values=None, - pixel_values_videos=None, - image_grid_thw=None, - video_grid_thw=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - rope_deltas = kwargs.get("rope_deltas", None) - if attention_mask is not None and position_ids is None: - if cache_position is None or (cache_position is not None and cache_position[0] == 0): - position_ids, rope_deltas = self.get_rope_index( - input_ids, image_grid_thw, video_grid_thw, attention_mask - ) - else: - batch_size, seq_length = input_ids.shape - delta = ( - cache_position[0] + rope_deltas if cache_position is not None and rope_deltas is not None else 0 - ) - position_ids = torch.arange(seq_length, device=input_ids.device) - position_ids = position_ids.view(1, -1).expand(batch_size, -1) - position_ids = position_ids.add(delta) - position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) - - if cache_position[0] != 0: - pixel_values = None - pixel_values_videos = None - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": 
attention_mask, - "pixel_values": pixel_values, - "pixel_values_videos": pixel_values_videos, - "image_grid_thw": image_grid_thw, - "video_grid_thw": video_grid_thw, - "rope_deltas": rope_deltas, - } - ) - return model_inputs - + # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602 def _update_model_kwargs_for_generation( self, outputs: ModelOutput, From f791b94fbe872ae6f84122ed809c6ba69ac30dbb Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 18 Dec 2024 09:20:40 +0400 Subject: [PATCH 11/12] Update optimum/intel/openvino/modeling_visual_language.py --- optimum/intel/openvino/modeling_visual_language.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 3476d56930..430ad3acb1 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -2155,7 +2155,8 @@ def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs): pixel_values=hidden_states, attention_mask=causal_mask, rotary_pos_emb=rotary_pos_emb )[0] return res - + # Adopted from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1089 + # Use config values instead of model attributes, replace self.rotary_pos_emb -> self._rotary_pos_emb def rot_pos_emb(self, grid_thw): pos_ids = [] for t, h, w in grid_thw: From 0eb94f5058647e50019e8138a87bb2a1ab5987e9 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 18 Dec 2024 09:25:26 +0400 Subject: [PATCH 12/12] fix typings in patchers --- optimum/exporters/openvino/model_patcher.py | 10 ++++------ optimum/intel/openvino/modeling_visual_language.py | 1 + 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index d8ab55d51c..825eaac487 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -21,11 +21,9 @@ import torch import torch.nn.functional as F -from transformers import PreTrainedModel, TFPreTrainedModel from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling from transformers.utils import is_tf_available -from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.model_patcher import DecoderModelPatcher, ModelPatcher, override_arguments from optimum.intel.utils.import_utils import ( _openvino_version, @@ -3385,8 +3383,8 @@ def __exit__(self, exc_type, exc_value, traceback): class Qwen2VLLanguageModelPatcher(DecoderModelPatcher): def __init__( self, - config: OnnxConfig, - model: Union[PreTrainedModel, TFPreTrainedModel], + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Dict[str, Any] = None, ): model.__orig_forward = model.forward @@ -3424,8 +3422,8 @@ def __exit__(self, exc_type, exc_value, traceback): class Qwen2VLVisionEmbMergerPatcher(ModelPatcher): def __init__( self, - config: OnnxConfig, - model: Union[PreTrainedModel, TFPreTrainedModel], + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Dict[str, Any] = None, ): model.__orig_forward = model.forward diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 430ad3acb1..068dd70e07 100644 --- a/optimum/intel/openvino/modeling_visual_language.py 
+++ b/optimum/intel/openvino/modeling_visual_language.py @@ -2155,6 +2155,7 @@ def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs): pixel_values=hidden_states, attention_mask=causal_mask, rotary_pos_emb=rotary_pos_emb )[0] return res + # Adopted from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1089 # Use config values instead of model attributes, replace self.rotary_pos_emb -> self._rotary_pos_emb def rot_pos_emb(self, grid_thw):
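For reference, below is a minimal usage sketch of how the Qwen2-VL support added by this series is expected to be exercised through OVModelForVisualCausalLM. The checkpoint id, image URL, prompt and generation settings are illustrative assumptions, not values taken from these patches; the processor and chat-template flow follows the usual upstream Qwen2-VL pattern in transformers (which requires transformers >= 4.45), while export=True triggers the OpenVINO conversion paths introduced above.

# Illustrative sketch only: model id, image URL and prompt are assumptions.
import requests
from PIL import Image
from transformers import AutoProcessor

from optimum.intel import OVModelForVisualCausalLM

model_id = "Qwen/Qwen2-VL-2B-Instruct"  # assumed example checkpoint
processor = AutoProcessor.from_pretrained(model_id)
# export=True converts the language model, vision embeddings and embeddings merger to OpenVINO IR
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)

image = Image.open(
    requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
)
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
# Build the text prompt with image placeholders, then pack text + pixels together
text = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=[text], images=[image], return_tensors="pt")

generated_ids = model.generate(**inputs, max_new_tokens=64)
# Strip the prompt tokens before decoding the answer
answer_ids = generated_ids[:, inputs["input_ids"].shape[1]:]
print(processor.batch_decode(answer_ids, skip_special_tokens=True)[0])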