From 22d93e74ceffba796d8fb0dd47d99680be4b5608 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Thu, 12 Dec 2024 15:17:18 +0200 Subject: [PATCH 1/4] Add ONNX export support for MGP-STR (#2099) * Enable mpg-str ONNX export * No longer needed * Improve model patcher * Formatting * `ruff` * Also support image-to-text task * Add unit tests * Add listed support for MGP-STR --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 16 ++++++++++++ optimum/exporters/onnx/model_patcher.py | 26 ++++++++++++++++++++ optimum/exporters/tasks.py | 7 +++++- tests/exporters/exporters_utils.py | 2 ++ tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 6 files changed, 52 insertions(+), 1 deletion(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 57005b85678..46ab3cb8a64 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -65,6 +65,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Marian - MarkupLM - MBart +- MGP-STR - Mistral - MobileBert - MobileVit diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index b39d19ec782..85e235f9a96 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -82,6 +82,7 @@ from .model_patcher import ( CLIPModelPatcher, FalconModelPatcher, + MgpstrModelPatcher, MistralModelPatcher, MusicgenModelPatcher, SAMModelPatcher, @@ -933,6 +934,21 @@ def torch_to_onnx_input_map(self) -> Dict[str, str]: return {"x": "pixel_values"} +class MgpstrOnnxConfig(ViTOnnxConfig): + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "char_logits": {0: "batch_size"}, + "bpe_logits": {0: "batch_size"}, + "wp_logits": {0: "batch_size"}, + } + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MgpstrModelPatcher(self, model, model_kwargs=model_kwargs) + + class SentenceTransformersTransformerOnnxConfig(TextEncoderOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DEFAULT_ONNX_OPSET = 14 # Some bottleneck transformers models require a specific ONNX opset to be successfully exported. We put a rather high opset here for the export to work for all architectures. 
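# Illustrative usage sketch (not part of the patch itself): a minimal example of the export path
# that the MgpstrOnnxConfig above enables, assuming the `main_export` entry point and the
# "alibaba-damo/mgp-str-base" checkpoint referenced in the test tables of this patch.
# The output directory name is hypothetical.
from optimum.exporters.onnx import main_export

main_export(
    "alibaba-damo/mgp-str-base",  # MGP-STR checkpoint listed in tests/exporters/exporters_utils.py
    output="mgp_str_onnx",        # hypothetical output directory
    task="image-to-text",         # one of the tasks registered for "mgp-str" in optimum/exporters/tasks.py
)
# The exported graph exposes the three named outputs declared above (char_logits, bpe_logits,
# wp_logits), which MgpstrModelPatcher (next file) unpacks from the model's logits tuple.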
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 2c0f9aeba67..083bc127999 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -509,6 +509,32 @@ def patched_forward(*args, **kwargs): self.patched_forward = patched_forward +class MgpstrModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + @functools.wraps(self.orig_forward) + def patched_forward(*args, **kwargs): + signature = inspect.signature(self.orig_forward) + args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) + + # logits is a tuple, so we unpack it and return them as separate outputs + char_logits, bpe_logits, wp_logits = self.orig_forward(*args, **kwargs).logits + + return { + "char_logits": char_logits, + "bpe_logits": bpe_logits, + "wp_logits": wp_logits, + } + + self.patched_forward = patched_forward + + class SAMModelPatcher(ModelPatcher): def __init__( self, diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 0a3758e97cf..ba17730f9d9 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -211,7 +211,7 @@ class TasksManager: "image-classification": "AutoModelForImageClassification", "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), "image-to-image": "AutoModelForImageToImage", - "image-to-text": "AutoModelForVision2Seq", + "image-to-text": ("AutoModelForVision2Seq", "AutoModel"), "mask-generation": "AutoModel", "masked-im": "AutoModelForMaskedImageModeling", "multiple-choice": "AutoModelForMultipleChoice", @@ -824,6 +824,11 @@ class TasksManager: "question-answering", onnx="MBartOnnxConfig", ), + "mgp-str": supported_tasks_mapping( + "feature-extraction", + "image-to-text", + onnx="MgpstrOnnxConfig", + ), "mistral": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 32156d9eebf..5f071e0f9eb 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -116,6 +116,7 @@ "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "hf-internal-testing/tiny-random-mbart", + "mgp-str": "hf-internal-testing/tiny-random-MgpstrForSceneTextRecognition", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet-v2": "hf-internal-testing/tiny-random-MobileNetV2Model", @@ -247,6 +248,7 @@ "marian": "Helsinki-NLP/opus-mt-en-de", "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "sshleifer/tiny-mbart", + "mgp-str": "alibaba-damo/mgp-str-base", "mobilebert": "google/mobilebert-uncased", # "mobilenet_v1": "google/mobilenet_v1_0.75_192", # "mobilenet_v2": "google/mobilenet_v2_0.35_96", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index cccecd53817..c33c07fc7b1 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -118,6 +118,7 @@ "m2m_100": "hf-internal-testing/tiny-random-m2m_100", "marian": "echarlaix/tiny-random-marian", "mbart": "hf-internal-testing/tiny-random-mbart", + "mgp-str": 
"hf-internal-testing/tiny-random-MgpstrForSceneTextRecognition", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet_v1": "google/mobilenet_v1_0.75_192", From 3f007661377956402439f2ea0567d28b930ca38c Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Fri, 13 Dec 2024 17:58:40 +0200 Subject: [PATCH 2/4] Add ONNX export support for OLMo and OLMo2 (#2121) Add support for OLMo and OLMo2 Co-authored-by: Ella Charlaix --- docs/source/exporters/onnx/overview.mdx | 2 ++ optimum/exporters/onnx/model_configs.py | 9 +++++++++ optimum/exporters/tasks.py | 14 ++++++++++++++ tests/exporters/exporters_utils.py | 2 ++ 4 files changed, 27 insertions(+) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 46ab3cb8a64..fbe7b42c44f 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -75,6 +75,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - MT5 - Musicgen (text-conditional only) - Nystromformer +- OLMo +- OLMo2 - OWL-ViT - Pegasus - Perceiver diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 85e235f9a96..1c838408807 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -325,6 +325,15 @@ class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +class OlmoOnnxConfig(LlamaOnnxConfig): + ATOL_FOR_VALIDATION = 1e-4 + MIN_TRANSFORMERS_VERSION = version.parse("4.40.0") + + +class Olmo2OnnxConfig(OlmoOnnxConfig): + MIN_TRANSFORMERS_VERSION = version.parse("4.47.0") + + class Qwen2OnnxConfig(LlamaOnnxConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.37.0") diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index ba17730f9d9..32e90c7da19 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -954,6 +954,20 @@ class TasksManager: "text-generation-with-past", onnx="GraniteOnnxConfig", ), + "olmo": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + onnx="OlmoOnnxConfig", + ), + "olmo2": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + onnx="Olmo2OnnxConfig", + ), "pegasus": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 5f071e0f9eb..e04a850bc8c 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -127,6 +127,8 @@ "mt5": "lewtun/tiny-random-mt5", "musicgen": "hf-internal-testing/tiny-random-MusicgenForConditionalGeneration", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", + "olmo": "hf-internal-testing/tiny-random-OlmoForCausalLM", + "olmo2": "hf-internal-testing/tiny-random-Olmo2ForCausalLM", "opt": "hf-internal-testing/tiny-random-OPTModel", "owlv2": "hf-internal-testing/tiny-random-Owlv2Model", "owlvit": "hf-tiny-model-private/tiny-random-OwlViTModel", From 4daa40896f693649e21696c509cd98c7e0c40e3c Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 13 Dec 2024 17:03:26 +0100 Subject: [PATCH 3/4] Pass on `model_kwargs` when loading a sentence-transformers model before export (#2126) --- optimum/exporters/tasks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 32e90c7da19..4db4130302d 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -2141,6 +2141,7 @@ def get_model_from_task( use_auth_token = model_kwargs.pop("use_auth_token", None) token = model_kwargs.pop("token", None) trust_remote_code = model_kwargs.pop("trust_remote_code", False) + model_kwargs["torch_dtype"] = torch_dtype if use_auth_token is not None: warnings.warn( @@ -2158,6 +2159,7 @@ def get_model_from_task( token=token, revision=revision, trust_remote_code=trust_remote_code, + model_kwargs=model_kwargs, ) else: try: From 0c42291f9dbdcaf52dc1cab44c25452d853a96b7 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Thu, 19 Dec 2024 10:47:39 +0200 Subject: [PATCH 4/4] Add ONNX export support for DinoV2, Hiera, Maskformer, PVT, SigLIP, SwinV2, VitMAE, and VitMSN models (#2001) * Add support for siglip models * cleanup * remove submodule * Add ONNX export for DinoV2 models * Use height and width from preprocessor * formatting * Remove attention mask from model input * Add ONNX export support for Hiera models * Add ONNX export support for SwinV2 * Upgrade Siglip to opset=14 * Add VQA task * Add ONNX export support for Maskformer * Add ONNX export support for PVT * Add ONNX export support for ViTMAE and ViTMSN * Add siglip unit tests * Add vit-mae unit tests * Code formatting * Add maskformer to list of supported models * Formatting * fix typo * remove vit-mae masked-im task * remove vit-msn masked-im task * fix output names for maskformer export --------- Co-authored-by: Ella Charlaix --- docs/source/exporters/onnx/overview.mdx | 8 ++ optimum/exporters/onnx/model_configs.py | 118 +++++++++++++++++++ optimum/exporters/tasks.py | 70 ++++++++++- optimum/onnxruntime/modeling_ort.py | 4 +- optimum/onnxruntime/utils.py | 1 + optimum/utils/normalized_config.py | 4 + tests/exporters/exporters_utils.py | 16 +++ tests/onnxruntime/test_modeling.py | 1 + tests/onnxruntime/utils_onnxruntime_tests.py | 7 +- 9 files changed, 224 insertions(+), 5 deletions(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index fbe7b42c44f..b5129c23f21 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -39,6 +39,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Decision Transformer - Deit - Detr +- DINOv2 - DistilBert - Donut-Swin - Electra @@ -53,6 +54,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - GPT-NeoX - OPT - GroupVit +- Hiera - Hubert - IBert - LayoutLM @@ -64,6 +66,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - M2-M100 - Marian - MarkupLM +- MaskFormer - MBart - MGP-STR - Mistral @@ -84,6 +87,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Phi3 - Pix2Struct - PoolFormer +- PVT - Qwen2(Qwen1.5) - RegNet - RemBERT @@ -95,10 +99,12 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - SEW - SEW-D - Speech2Text +- SigLIP - SpeechT5 - Splinter - SqueezeBert - Swin +- SwinV2 - T5 - Table Transformer - TROCR @@ -106,6 +112,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - UniSpeech SAT - Vision Encoder Decoder - Vit +- VitMAE +- VitMSN - Wav2Vec2 - Wav2Vec2 Conformer - WavLM diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 1c838408807..4c5a727a183 100644 --- 
a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -847,6 +847,65 @@ class ConvNextV2OnnxConfig(ViTOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class HieraOnnxConfig(ViTOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + + +class PvtOnnxConfig(ViTOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + + +class VitMAEOnnxConfig(ViTOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 14, try exporting with this version. + DEFAULT_ONNX_OPSET = 14 + + +class VitMSNOnnxConfig(ViTOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 14, try exporting with this version. + DEFAULT_ONNX_OPSET = 14 + + +class Dinov2DummyInputGenerator(DummyVisionInputGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = DEFAULT_DUMMY_SHAPES["width"], + height: int = DEFAULT_DUMMY_SHAPES["height"], + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + num_channels=num_channels, + width=width, + height=height, + **kwargs, + ) + + from transformers.onnx.utils import get_preprocessor + + preprocessor = get_preprocessor(normalized_config._name_or_path) + if preprocessor is not None and hasattr(preprocessor, "crop_size"): + self.height = preprocessor.crop_size.get("height", self.height) + self.width = preprocessor.crop_size.get("width", self.width) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + input_ = super().generate( + input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype + ) + return input_ + + +class Dinov2OnnxConfig(ViTOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (Dinov2DummyInputGenerator,) + + class MobileViTOnnxConfig(ViTOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 DEFAULT_ONNX_OPSET = 11 @@ -888,6 +947,10 @@ class SwinOnnxConfig(ViTOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class SwinV2OnnxConfig(SwinOnnxConfig): + pass + + class Swin2srOnnxConfig(SwinOnnxConfig): pass @@ -923,6 +986,28 @@ class MobileNetV2OnnxConfig(MobileNetV1OnnxConfig): pass +class MaskFormerOnnxConfig(ViTOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::einsum' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 12, try exporting with this version. 
+ DEFAULT_ONNX_OPSET = 12 + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self.task == "image-segmentation": + return { + "class_queries_logits": {0: "batch_size", 1: "num_queries"}, + "masks_queries_logits": {0: "batch_size", 1: "num_queries", 2: "height", 3: "width"}, + } + else: + return super().outputs + + @property + def torch_to_onnx_output_map(self) -> Dict[str, str]: + return { + "transformer_decoder_last_hidden_state": "last_hidden_state", + } + + class DonutSwinOnnxConfig(ViTOnnxConfig): DEFAULT_ONNX_OPSET = 11 @@ -1115,6 +1200,39 @@ def patch_model_for_export( return CLIPModelPatcher(self, model, model_kwargs=model_kwargs) +class SiglipNormalizedConfig(CLIPNormalizedConfig): + pass + + +class SiglipOnnxConfig(CLIPOnnxConfig): + NORMALIZED_CONFIG_CLASS = SiglipNormalizedConfig + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 13 is not supported. + # Support for this operator was added in version 14, try exporting with this version. + DEFAULT_ONNX_OPSET = 14 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "text_batch_size", 1: "sequence_length"}, + "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, + # NOTE: No attention_mask + } + + +class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig): + pass + + +class SiglipTextOnnxConfig(CLIPTextOnnxConfig): + pass + + +class SiglipVisionModelOnnxConfig(CLIPVisionModelOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 14, try exporting with this version. 
+ DEFAULT_ONNX_OPSET = 14 + + class UNetOnnxConfig(VisionOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 4db4130302d..7cb5a31d2d5 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -209,7 +209,12 @@ class TasksManager: "feature-extraction": "AutoModel", "fill-mask": "AutoModelForMaskedLM", "image-classification": "AutoModelForImageClassification", - "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), + "image-segmentation": ( + "AutoModelForImageSegmentation", + "AutoModelForSemanticSegmentation", + "AutoModelForInstanceSegmentation", + "AutoModelForUniversalSegmentation", + ), "image-to-image": "AutoModelForImageToImage", "image-to-text": ("AutoModelForVision2Seq", "AutoModel"), "mask-generation": "AutoModel", @@ -224,6 +229,7 @@ class TasksManager: "text2text-generation": "AutoModelForSeq2SeqLM", "text-classification": "AutoModelForSequenceClassification", "token-classification": "AutoModelForTokenClassification", + "visual-question-answering": "AutoModelForVisualQuestionAnswering", "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", } @@ -307,6 +313,7 @@ class TasksManager: "vision2seq-lm": "image-to-text", "zero-shot-classification": "text-classification", "image-feature-extraction": "feature-extraction", + "pretraining": "feature-extraction", # for backward compatibility and testing (where # model task and model type are still the same) "stable-diffusion": "text-to-image", @@ -601,6 +608,11 @@ class TasksManager: "image-segmentation", onnx="DetrOnnxConfig", ), + "dinov2": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="Dinov2OnnxConfig", + ), "distilbert": supported_tasks_mapping( "feature-extraction", "fill-mask", @@ -732,6 +744,11 @@ class TasksManager: "feature-extraction", onnx="GroupViTOnnxConfig", ), + "hiera": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="HieraOnnxConfig", + ), "hubert": supported_tasks_mapping( "feature-extraction", "automatic-speech-recognition", @@ -813,6 +830,11 @@ class TasksManager: "question-answering", onnx="MarkupLMOnnxConfig", ), + "maskformer": supported_tasks_mapping( + "feature-extraction", + "image-segmentation", + onnx="MaskFormerOnnxConfig", + ), "mbart": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -1011,6 +1033,11 @@ class TasksManager: "image-classification", onnx="PoolFormerOnnxConfig", ), + "pvt": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="PvtOnnxConfig", + ), "regnet": supported_tasks_mapping( "feature-extraction", "image-classification", @@ -1070,6 +1097,23 @@ class TasksManager: "audio-classification", onnx="SEWDOnnxConfig", ), + "siglip": supported_tasks_mapping( + "feature-extraction", + "zero-shot-image-classification", + onnx="SiglipOnnxConfig", + ), + "siglip-text-model": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextOnnxConfig", + ), + "siglip-text-with-projection": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextWithProjectionOnnxConfig", + ), + "siglip-vision-model": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipVisionModelOnnxConfig", + ), "speech-to-text": supported_tasks_mapping( "feature-extraction", 
"feature-extraction-with-past", @@ -1102,6 +1146,12 @@ class TasksManager: "masked-im", onnx="SwinOnnxConfig", ), + "swinv2": supported_tasks_mapping( + "feature-extraction", + "image-classification", + "masked-im", + onnx="SwinV2OnnxConfig", + ), "swin2sr": supported_tasks_mapping( "feature-extraction", "image-to-image", @@ -1148,7 +1198,19 @@ class TasksManager: onnx="VisionEncoderDecoderOnnxConfig", ), "vit": supported_tasks_mapping( - "feature-extraction", "image-classification", "masked-im", onnx="ViTOnnxConfig" + "feature-extraction", + "image-classification", + "masked-im", + onnx="ViTOnnxConfig", + ), + "vit-mae": supported_tasks_mapping( + "feature-extraction", + onnx="VitMAEOnnxConfig", + ), + "vit-msn": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="VitMSNOnnxConfig", ), "vits": supported_tasks_mapping( "text-to-audio", @@ -1232,6 +1294,10 @@ class TasksManager: "unet-2d-condition", "vae-encoder", "vae-decoder", + "clip-text-model", + "clip-text-with-projection", + "siglip-text-model", + "siglip-text-with-projection", # redundant model types "trocr", # same as vision-encoder-decoder } diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 8e5a814b689..a55eb064fa3 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -1696,7 +1696,7 @@ def forward( @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForImageClassification(ORTModel): """ - ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, vit. + ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, dinov2, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, swinv2, vit. """ auto_model_class = AutoModelForImageClassification @@ -1784,7 +1784,7 @@ def forward( @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForSemanticSegmentation(ORTModel): """ - ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. This class officially supports segformer. + ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. This class officially supports maskformer, segformer. 
""" auto_model_class = AutoModelForSemanticSegmentation diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 9e92e0bd325..79375d958ff 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -178,6 +178,7 @@ def check_optimization_supported_model(cls, model_type: str, optimization_config "clip", "vit", "swin", + "swinv2", ] model_type = model_type.replace("_", "-") if (model_type not in cls._conf) or (cls._conf[model_type] not in supported_model_types_for_optimization): diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 9ceed24c2dd..9fde2bd4696 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -204,8 +204,10 @@ class NormalizedConfigManager: 'data2vec-text', 'data2vec-vision', 'detr', + 'dinov2', 'flaubert', 'groupvit', + 'hiera', 'ibert', 'layoutlm', 'layoutlmv3', @@ -216,6 +218,8 @@ class NormalizedConfigManager: 'owlvit', 'perceiver', 'roformer', + 'segformer', + 'siglip', 'squeezebert', 'table-transformer', """ diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index e04a850bc8c..900b5f3b5ce 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -69,6 +69,7 @@ "deberta-v2": "hf-internal-testing/tiny-random-DebertaV2Model", "decision-transformer": "edbeeching/decision-transformer-gym-hopper-medium", "deit": "hf-internal-testing/tiny-random-DeiTModel", + "dinov2": "hf-internal-testing/tiny-random-Dinov2Model", "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "donut-swin": "hf-internal-testing/tiny-random-DonutSwinModel", "detr": "hf-internal-testing/tiny-random-DetrModel", # hf-internal-testing/tiny-random-detr is larger @@ -103,6 +104,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJModel", "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", + "hiera": "hf-internal-testing/tiny-random-HieraForImageClassification", "ibert": "hf-internal-testing/tiny-random-IBertModel", "imagegpt": "hf-internal-testing/tiny-random-ImageGPTModel", "levit": "hf-internal-testing/tiny-random-LevitModel", @@ -115,6 +117,7 @@ "m2m-100": "hf-internal-testing/tiny-random-m2m_100", "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", + "maskformer": "hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation", "mbart": "hf-internal-testing/tiny-random-mbart", "mgp-str": "hf-internal-testing/tiny-random-MgpstrForSceneTextRecognition", "mistral": "echarlaix/tiny-random-mistral", @@ -143,6 +146,7 @@ # "rembert": "google/rembert", "rembert": "hf-internal-testing/tiny-random-RemBertModel", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", "qwen2": "fxmarty/tiny-dummy-qwen2", "regnet": "hf-internal-testing/tiny-random-RegNetModel", "resnet": "hf-internal-testing/tiny-random-resnet", @@ -150,13 +154,18 @@ "roformer": "hf-internal-testing/tiny-random-RoFormerModel", "sam": "fxmarty/sam-vit-tiny-random", "segformer": "hf-internal-testing/tiny-random-SegformerModel", + "siglip": "hf-internal-testing/tiny-random-SiglipModel", + "siglip-vision-model": "hf-internal-testing/tiny-random-SiglipVisionModel", "splinter": "hf-internal-testing/tiny-random-SplinterModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "swin": 
"hf-internal-testing/tiny-random-SwinModel", + "swinv2": "hf-internal-testing/tiny-random-Swinv2Model", "swin2sr": "hf-internal-testing/tiny-random-Swin2SRModel", "t5": "hf-internal-testing/tiny-random-t5", "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel", "vit": "hf-internal-testing/tiny-random-vit", + "vit-mae": "hf-internal-testing/tiny-random-ViTMAEModel", + "vit-msn": "hf-internal-testing/tiny-random-ViTMSNForImageClassification", "vits": "echarlaix/tiny-random-vits", "yolos": "hf-internal-testing/tiny-random-YolosModel", "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken @@ -237,6 +246,7 @@ "gpt-neox": "EleutherAI/gpt-neox-20b", "gptj": "anton-l/gpt-j-tiny-random", # TODO "groupvit": "nvidia/groupvit-gcc-yfcc", + "hiera": "facebook/hiera-tiny-224-in1k-hf", "ibert": "kssteven/ibert-roberta-base", "imagegpt": "openai/imagegpt-small", "levit": "facebook/levit-128S", @@ -249,6 +259,7 @@ "m2m-100": "hf-internal-testing/tiny-random-m2m_100", # Not using facebook/m2m100_418M because it takes too much time for testing. "marian": "Helsinki-NLP/opus-mt-en-de", "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", + "maskformer": "facebook/maskformer-swin-tiny-coco", "mbart": "sshleifer/tiny-mbart", "mgp-str": "alibaba-damo/mgp-str-base", "mobilebert": "google/mobilebert-uncased", @@ -264,18 +275,23 @@ "perceiver": "hf-internal-testing/tiny-random-PerceiverModel", # Not using deepmind/language-perceiver because it takes too much time for testing. "rembert": "google/rembert", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", "regnet": "facebook/regnet-y-040", "resnet": "microsoft/resnet-50", "roberta": "roberta-base", "roformer": "junnyu/roformer_chinese_base", "sam": "facebook/sam-vit-base", "segformer": "nvidia/segformer-b0-finetuned-ade-512-512", + "siglip": "google/siglip-base-patch16-224", "splinter": "hf-internal-testing/tiny-random-SplinterModel", "squeezebert": "squeezebert/squeezebert-uncased", "swin": "microsoft/swin-tiny-patch4-window7-224", + "swinv2": "microsoft/swinv2-tiny-patch4-window16-256", "t5": "t5-small", "table-transformer": "microsoft/table-transformer-detection", "vit": "google/vit-base-patch16-224", + "vit-mae": "facebook/vit-mae-base", + "vit-msn": "facebook/vit-msn-small", "yolos": "hustvl/yolos-tiny", "whisper": "openai/whisper-tiny.en", "hubert": "facebook/hubert-base-ls960", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 8f52ef45180..255c0d9d0e7 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2827,6 +2827,7 @@ class ORTModelForImageClassificationIntegrationTest(ORTModelTestMixin): "convnextv2", "data2vec_vision", "deit", + "dinov2", "levit", "mobilenet_v1", "mobilenet_v2", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index c33c07fc7b1..02ced3be3aa 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -87,8 +87,9 @@ "deit": "hf-internal-testing/tiny-random-DeiTModel", "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "detr": "hf-internal-testing/tiny-random-detr", - "dpt": "hf-internal-testing/tiny-random-DPTModel", + "dinov2": "hf-internal-testing/tiny-random-Dinov2Model", "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", + "dpt": "hf-internal-testing/tiny-random-DPTModel", "electra": 
"hf-internal-testing/tiny-random-ElectraModel", "encoder-decoder": { "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ @@ -107,6 +108,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJForCausalLM", "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", + "hiera": "hf-internal-testing/tiny-random-HieraForImageClassification", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", "latent-consistency": "echarlaix/tiny-random-latent-consistency", @@ -135,6 +137,7 @@ "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", "qwen2": "fxmarty/tiny-dummy-qwen2", "rembert": "hf-internal-testing/tiny-random-RemBertModel", "resnet": "hf-internal-testing/tiny-random-resnet", @@ -143,12 +146,14 @@ "segformer": "hf-internal-testing/tiny-random-SegformerModel", "sew": "hf-internal-testing/tiny-random-SEWModel", "sew_d": "asapp/sew-d-tiny-100k-ft-ls100h", + "siglip": "hf-internal-testing/tiny-random-SiglipModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "speech_to_text": "hf-internal-testing/tiny-random-Speech2TextModel", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-3": "optimum-internal-testing/tiny-random-stable-diffusion-3", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", + "swinv2": "hf-internal-testing/tiny-random-Swinv2Model", "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", "swin2sr": "hf-internal-testing/tiny-random-Swin2SRForImageSuperResolution", "t5": "hf-internal-testing/tiny-random-t5",