diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 1ffcabb48..802cd0241 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -88,6 +88,8 @@ PersimmonModelPatcher, Phi3ModelPatcher, Phi3VisionImageEmbeddingsPatcher, + Qwen2VLLanguageModelPatcher, + Qwen2VLVisionEmbMergerPatcher, QwenModelPatcher, RotaryEmbPatcher, UpdateCausalMaskModelPatcher, @@ -106,6 +108,10 @@ def init_model_configs(): "transformers", "LlavaNextForConditionalGeneration", ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen2-vl", "image-text-to-text")] = ( + "transformers", + "Qwen2VLForConditionalGeneration", + ) TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[ "image-text-to-text" ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"] @@ -1288,18 +1294,26 @@ def patch_model_for_export( class LMInputEmbedsConfigHelper(TextDecoderWithPositionIdsOnnxConfig): - def __init__(self, export_config): + def __init__(self, export_config, patcher_cls=None, dummy_input_generator=None, inputs_update=None): self.orig_export_config = export_config + if dummy_input_generator is not None: + export_config.DUMMY_INPUT_GENERATOR_CLASSES = ( + dummy_input_generator, + ) + export_config.DUMMY_INPUT_GENERATOR_CLASSES self.DUMMY_INPUT_GENERATOR_CLASSES = export_config.DUMMY_INPUT_GENERATOR_CLASSES self.DEFAULT_ONNX_OPSET = export_config.DEFAULT_ONNX_OPSET self.DUMMY_PKV_GENERATOR_CLASS = export_config.DUMMY_PKV_GENERATOR_CLASS self._config = export_config._config self._normalized_config = export_config._normalized_config self.use_past = export_config.use_past + self.patcher_cls = patcher_cls + self.input_info_upd = inputs_update def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": + if self.patcher_cls is not None: + return self.patcher_cls(self, model, model_kwargs=model_kwargs) # Refer to DecoderModelPatcher. 
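+        # No patcher override was provided: fall back to the wrapped export config's own patching logic.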
return self.orig_export_config.patch_model_for_export(model, model_kwargs=model_kwargs) @@ -1312,6 +1326,8 @@ def inputs(self) -> Dict[str, Dict[int, str]]: orig_inputs = self.orig_export_config.inputs input_ids_config = orig_inputs.pop("input_ids") orig_inputs["inputs_embeds"] = input_ids_config + if self.input_info_upd is not None: + orig_inputs.update(self.input_info_upd) return orig_inputs def generate_dummy_inputs(self, framework: str = "pt", **kwargs): @@ -1383,9 +1399,22 @@ def get_vlm_text_embeddings_config(model_type, model_config, int_dtype, float_dt return export_config -def get_vlm_text_generation_config(model_type, model_config, int_dtype, float_dtype): +def get_vlm_text_generation_config( + model_type, + model_config, + int_dtype, + float_dtype, + model_patcher=None, + dummy_input_generator=None, + inputs_update=None, +): internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype) - export_config = LMInputEmbedsConfigHelper(internal_export_config) + export_config = LMInputEmbedsConfigHelper( + internal_export_config, + patcher_cls=model_patcher, + dummy_input_generator=dummy_input_generator, + inputs_update=inputs_update, + ) export_config._normalized_config = internal_export_config._normalized_config return export_config @@ -1821,9 +1850,11 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int img_ids_height = self.height // 2 img_ids_width = self.width // 2 return self.random_int_tensor( - [self.batch_size, img_ids_height * img_ids_width, 3] - if is_diffusers_version("<", "0.31.0") - else [img_ids_height * img_ids_width, 3], + ( + [self.batch_size, img_ids_height * img_ids_width, 3] + if is_diffusers_version("<", "0.31.0") + else [img_ids_height * img_ids_width, 3] + ), min_value=0, max_value=min(img_ids_height, img_ids_width), framework=framework, @@ -2260,3 +2291,192 @@ def patch_model_for_export( if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS: return Phi3VisionImageEmbeddingsPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) + + +class DummyQwen2VLLMInputGenerator(DummyTextInputGenerator): + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + generated_input = super().generate(input_name, framework, int_dtype, float_dtype) + if input_name == "position_ids": + return generated_input.unsqueeze(0).expand(3, -1, -1) + return generated_input + + +class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("hidden_states", "attention_mask", "rotary_pos_emb") + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = 1, + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = 420, + height: int = 420, + **kwargs, + ): + self.batch_size = batch_size + self.height = height + self.width = width + self.num_channels = num_channels + self.temporal_patch_size = normalized_config.config.temporal_patch_size + self.patch_size = normalized_config.config.patch_size + if normalized_config.use_embed_dim: + self.embed_dim = normalized_config.config.embed_dim + else: + self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size + self.num_heads = normalized_config.config.num_heads + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + grid_h, grid_w = self.height // self.patch_size, 
self.width // self.patch_size
+        grid_t = self.batch_size
+
+        if input_name == "hidden_states":
+            return self.random_float_tensor(
+                [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype
+            )
+
+        if input_name == "attention_mask":
+            return self.random_mask_tensor(
+                [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype
+            )
+
+        if input_name == "rotary_pos_emb":
+            dim = self.embed_dim // self.num_heads // 2
+            return self.random_float_tensor([grid_h * grid_t * grid_w, dim], framework=framework, dtype=float_dtype)
+
+
+class Qwen2VLConfigBehavior(str, enum.Enum):
+    LANGUAGE = "language"
+    VISION_EMBEDDINGS = "vision_embeddings"
+    VISION_EMBEDDINGS_MERGER = "vision_embeddings_merger"
+    TEXT_EMBEDDINGS = "text_embeddings"
+
+
+@register_in_tasks_manager("qwen2-vl", *["image-text-to-text"], library_name="transformers")
+class Qwen2VLOpenVINOConfig(OnnxConfig):
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior]
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,)
+    MIN_TRANSFORMERS_VERSION = version.parse("4.45.0")
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: Qwen2VLConfigBehavior = Qwen2VLConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+    ):
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+        self._behavior = behavior
+        self._orig_config = config
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
+            self._config = config.vision_config
+            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+            self._normalized_config.use_embed_dim = False
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER and hasattr(config, "vision_config"):
+            self._config = config.vision_config
+            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+            self._normalized_config.use_embed_dim = True
+
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior):
+            behavior = Qwen2VLConfigBehavior(behavior)
+
+        if behavior == Qwen2VLConfigBehavior.LANGUAGE:
+            return model
+
+        if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS:
+            vision_embeddings = model.visual.patch_embed
+            vision_embeddings.config = model.config.vision_config
+            return vision_embeddings
+
+        if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
+            vision_emb_merger = model.visual
+            vision_emb_merger.config = model.config.vision_config
+            return vision_emb_merger
+
+        if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = model.model.embed_tokens
+            text_embedding.config = model.config
+            return text_embedding
+
+    def with_behavior(
+        self,
+        behavior: Union[str, Qwen2VLConfigBehavior],
+    ):
+        """
+        Creates a config for a different behavior.
+
+        Args:
+            behavior ([`ConfigBehavior`]):
+                The behavior to use for the new instance.
+ """ + if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior): + behavior = Qwen2VLConfigBehavior(behavior) + + if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + + if behavior == Qwen2VLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen2", + self._orig_config, + self.int_dtype, + self.float_dtype, + model_patcher=Qwen2VLLanguageModelPatcher, + dummy_input_generator=DummyQwen2VLLMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + ) + + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ): + model_kwargs = model_kwargs or {} + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return Qwen2VLVisionEmbMergerPatcher(self, model, model_kwargs) + return super().patch_model_for_export(model, model_kwargs) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS: + return {"hidden_states": {0: "patch_thw_grid", 1: "patch_temporal_channels"}} + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return { + "hidden_states": {0: "sequence_length"}, + "attention_mask": {1: "sequence_length", 2: "sequence_length"}, + "rotary_pos_emb": {0: "sequence_length"}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior in [Qwen2VLConfigBehavior.VISION_EMBEDDINGS, Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER]: + return {"last_hidden_state": {0: "seq_len"}} + return {} diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index c71cbfe00..825eaac48 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3378,3 +3378,109 @@ def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) for block in self._model.model.layers: block.self_attn.forward = block.self_attn._orig_forward + + +class Qwen2VLLanguageModelPatcher(DecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any] = None, + ): + model.__orig_forward = model.forward + + def forward_wrap( + self, + attention_mask, + position_ids=None, + past_key_values=None, + inputs_embeds=None, + input_ids=None, + ): + from transformers.cache_utils import DynamicCache + + new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + result = self.__orig_forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=new_past_key_values, + inputs_embeds=inputs_embeds, + ) + if past_key_values is not None: + result["past_key_values"] = result["past_key_values"].to_legacy_cache() + return result + + model.forward = types.MethodType(forward_wrap, model) + 
super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + +class Qwen2VLVisionEmbMergerPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any] = None, + ): + model.__orig_forward = model.forward + + # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1118 + # added attention_mask input instead cu_lens for its internal calculation model (unsupported by tracing due to cycle with dynamic len) + # separated patch_embed and rot_pos_emb calls for performing as part of another model + def image_embed_forward( + self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor + ) -> torch.Tensor: + for blk in self.blocks: + hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb) + return self.merger(hidden_states) + + model.forward = types.MethodType(image_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __enter__(self): + # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L390 + # added attention_mask input instead of internal calculation (unsupported by tracing due to cycle with dynamic len) + def sdpa_attn_forward( + self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor = None + ) -> torch.Tensor: + from transformers.models.qwen2_vl.modeling_qwen2_vl import apply_rotary_pos_emb_vision + + seq_length = hidden_states.shape[0] + q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0) + q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0) + k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0) + + q = q.transpose(0, 1) + k = k.transpose(0, 1) + v = v.transpose(0, 1) + attn_output = torch.nn.functional.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0) + attn_output = attn_output.transpose(0, 1) + attn_output = attn_output.reshape(seq_length, -1) + attn_output = self.proj(attn_output) + return attn_output + + # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L430 + # added attention_mask input propagation to self.attn + def block_forward(self, hidden_states, attention_mask, rotary_pos_emb) -> torch.Tensor: + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + for block in self._model.blocks: + block._orig_forward = block.forward + block.forward = types.MethodType(block_forward, block) + block.attn._orig_forward = block.attn.forward + block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + for block in self._model.blocks: + block.forward = block._orig_forward + block.attn.forward = block.attn._orig_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index db4df6d0d..46b151e7d 100644 --- 
a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -217,7 +217,15 @@ def get_submodels(model): return custom_export, fn_get_submodels -MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v"] +MULTI_MODAL_TEXT_GENERATION_MODELS = [ + "llava", + "llava-next", + "llava-qwen2", + "internvl-chat", + "minicpmv", + "phi3-v", + "qwen2-vl", +] def save_config(config, save_dir): diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 8d6edea0f..068dd70e0 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -3,8 +3,10 @@ import os import warnings from abc import abstractmethod +from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from types import MethodType +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union import numpy as np import openvino as ov @@ -22,10 +24,12 @@ PreTrainedTokenizer, ) from transformers.modeling_outputs import BaseModelOutputWithPooling +from transformers.utils import ModelOutput from ...exporters.openvino import main_export from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name from ...exporters.openvino.utils import save_config +from ..utils.import_utils import is_transformers_version from .configuration import OVConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel, OVModelPart from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM @@ -162,6 +166,9 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] + if self.config.model_type == "qwen2_vl" and position_ids.ndim != 3: + position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) + inputs["position_ids"] = position_ids if "beam_idx" in self.input_names: @@ -215,7 +222,12 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None: key.get_any_name() for key in self.model.outputs[2:] if "hidden_states" in key.get_any_name() ] self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} - self._main_input = "images" if model_has_input_output_name(self.model, "images") else "pixel_values" + if model_has_input_output_name(self.model, "images"): + self._main_input = "images" + elif model_has_input_output_name(self.model, "hidden_states"): + self._main_input = "hidden_states" + else: + self._main_input = "pixel_values" def forward(self, pixel_values, **kwargs): self._compile() @@ -268,6 +280,7 @@ def forward(self, img_features): "language_model": OVModelWithEmbedForCausalLM, "vision_embeddings": OVVisionEmbedding, "vision_projection": OVVisionProjection, + "vision_embeddings_merger": OVVisionEmbedding, } @@ -677,6 +690,10 @@ def forward( position_ids=None, image_bound=None, tgt_sizes=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + rope_deltas=None, **kwargs, ): inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings( @@ -688,6 +705,10 @@ def forward( past_key_values=past_key_values, image_bound=image_bound, tgt_sizes=tgt_sizes, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + rope_deltas=rope_deltas, **kwargs, ) return self.language_model.forward( @@ -782,6 +803,9 @@ def prepare_inputs_for_generation( "image_sizes": image_sizes, 
"image_bound": kwargs.get("image_bound"), "tgt_sizes": kwargs.get("tgt_sizes"), + "pixel_values_videos": kwargs.get("pixel_values_videos"), + "image_grid_thw": kwargs.get("image_grid_thw"), + "video_grid_thw": kwargs.get("video_grid_thw"), } ) return model_inputs @@ -2037,6 +2061,230 @@ def preprocess_inputs( return inputs +@dataclass +class QWen2VLModelOutputWithPast(ModelOutput): + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + rope_deltas: Optional[torch.FloatTensor] = None + + +class _OVQwen2VLForCausalLM(OVModelForVisualCausalLM): + additional_parts = ["vision_embeddings_merger"] + + def __init__( + self, + language_model: ov.Model, + text_embeddings: ov.Model, + vision_embeddings: ov.Model, + config: PretrainedConfig = None, + device: str = "CPU", + dynamic_shapes: bool = True, + ov_config: Optional[Dict[str, str]] = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + **kwargs, + ): + super().__init__( + language_model=language_model, + text_embeddings=text_embeddings, + vision_embeddings=vision_embeddings, + config=config, + device=device, + dynamic_shapes=dynamic_shapes, + ov_config=ov_config, + model_save_dir=model_save_dir, + quantization_config=quantization_config, + **kwargs, + ) + if is_transformers_version(">=", "4.45.0"): + from transformers.models.qwen2_vl.modeling_qwen2_vl import ( + Qwen2VLForConditionalGeneration, + VisionRotaryEmbedding, + ) + + self._rotary_pos_emb = VisionRotaryEmbedding( + self.config.vision_config.embed_dim // self.config.vision_config.num_heads // 2 + ) + self.get_rope_index = MethodType(Qwen2VLForConditionalGeneration.get_rope_index, self) + self.prepare_inputs_for_generation = MethodType( + Qwen2VLForConditionalGeneration.prepare_inputs_for_generation, self + ) + else: + raise ValueError( + f"Initialization model for {self.config.model_type} required at least transformers >= 4.45" + ) + + # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602 + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + num_new_tokens: int = 1, + ) -> Dict[str, Any]: + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + num_new_tokens=num_new_tokens, + ) + + if getattr(outputs, "rope_deltas", None) is not None: + model_kwargs["rope_deltas"] = outputs.rope_deltas + + return model_kwargs + + def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs): + hidden_states = self.vision_embeddings(pixel_values)[0] + rotary_pos_emb = self.rot_pos_emb(grid_thw) + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32 + ) + cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0) + attention_mask = torch.zeros((1, hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True + + 
causal_mask.masked_fill_(torch.logical_not(attention_mask), float("-inf")) + + res = self.vision_embeddings_merger( + pixel_values=hidden_states, attention_mask=causal_mask, rotary_pos_emb=rotary_pos_emb + )[0] + return res + + # Adopted from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1089 + # Use config values instead of model attributes, replace self.rotary_pos_emb -> self._rotary_pos_emb + def rot_pos_emb(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + w // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + w // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self._rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def get_multimodal_embeddings( + self, + input_ids, + pixel_values=None, + attention_mask=None, + position_ids=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) + if pixel_values is not None and input_ids.shape[1] != 1: + image_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values, image_grid_thw)) + image_mask = input_ids == self.config.image_token_id + inputs_embeds[image_mask] = image_embeds + if pixel_values_videos is not None and input_ids.shape[1] != 1: + pixel_values_videos = pixel_values_videos + video_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values_videos, video_grid_thw)) + video_mask = input_ids == self.config.video_token_id + inputs_embeds[video_mask] = video_embeds + return inputs_embeds, attention_mask, position_ids + + def forward( + self, + input_ids, + pixel_values=None, + past_key_values=None, + inputs_embeds=None, + image_sizes=None, + attention_mask=None, + position_ids=None, + image_bound=None, + tgt_sizes=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + rope_deltas=None, + **kwargs, + ): + result = super().forward( + input_ids, + pixel_values, + past_key_values, + inputs_embeds, + image_sizes, + attention_mask, + position_ids, + image_bound, + tgt_sizes, + pixel_values_videos, + image_grid_thw, + video_grid_thw, + rope_deltas, + **kwargs, + ) + final_result = QWen2VLModelOutputWithPast( + logits=result.logits, past_key_values=result.past_key_values, rope_deltas=rope_deltas + ) + return final_result + + @staticmethod + def preprocess_inputs( + text: str, + image: Optional["Image"] = None, + processor: Optional[AutoImageProcessor] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + config: Optional[PretrainedConfig] = None, + ): + if processor is None: + raise ValueError("Processor is required.") + if image is not None: + conversation = [ 
+ { + "role": "user", + "content": [ + { + "type": "image", + }, + {"type": "text", "text": text}, + ], + } + ] + else: + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + ], + } + ] + text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + inputs = processor(images=image, text=text_prompt, return_tensors="pt") + return inputs + + MODEL_TYPE_TO_CLS_MAPPING = { "llava": _OVLlavaForCausalLM, "llava_next": _OVLlavaNextForCausalLM, @@ -2044,4 +2292,5 @@ def preprocess_inputs( "llava-qwen2": _OVNanoLlavaForCausalLM, "phi3_v": _OVPhi3VisionForCausalLM, "internvl_chat": _OVInternVLForCausalLM, + "qwen2_vl": _OVQwen2VLForCausalLM, } diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 97cbe8ef2..7a3c824ee 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -178,6 +178,13 @@ class OVCLIExportTestCase(unittest.TestCase): "--dataset contextual --num-samples 1 --trust-remote-code", {"int8": 4, "int4": 14}, ), + ( + "image-text-to-text", + "qwen2_vl", + 'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" ' + "--dataset contextual --num-samples 1", + {"int8": 8, "int4": 22}, + ), ] ) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 240f4f9e3..03b4b6f49 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1976,7 +1976,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"] if is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"] TASK = "image-text-to-text" REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v"] @@ -1996,6 +1996,10 @@ def get_transformer_model_class(self, model_arch): from transformers import LlavaNextForConditionalGeneration return LlavaNextForConditionalGeneration + if model_arch == "qwen2_vl": + from transformers import Qwen2VLForConditionalGeneration + + return Qwen2VLForConditionalGeneration return AutoModelForCausalLM def _check_device_and_request(self, ov_model, expected_device, has_request): diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index d02dea3f1..54261b88f 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -457,6 +457,20 @@ class OVWeightCompressionTest(unittest.TestCase): ), {"int4": 14, "int8": 4}, ), + ( + OVModelForVisualCausalLM, + "qwen2_vl", + False, + dict( + bits=4, + group_size=16, + dataset="contextual", + ratio=0.8, + sensitivity_metric="mean_activation_magnitude", + num_samples=1, + ), + {"int4": 20, "int8": 10}, + ), ] ) @@ -481,6 +495,7 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version(">=", "4.45.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv", True)) + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen2_vl", False)) SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index bf509a044..dc8776082 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -115,6 +115,7 @@ "qwen": 
"katuni4ka/tiny-random-qwen", "qwen2": "fxmarty/tiny-dummy-qwen2", "qwen2-moe": "katuni4ka/tiny-random-qwen1.5-moe", + "qwen2_vl": "katuni4ka/tiny-random-qwen2vl", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-roberta", "roformer": "hf-internal-testing/tiny-random-roformer", @@ -187,6 +188,7 @@ "llava_next": (30, 9, 1), "minicpmv": (30, 26, 1, 6), "nanollava": (30, 15, 1), + "qwen2_vl": (30, 1, 1, 10), } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"