From 2dd5209d520ec06f1030650f0b1c8989a1974a09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:48:05 +0200 Subject: [PATCH 01/16] wip --- optimum/exporters/onnx/model_configs.py | 5 ++++ optimum/exporters/tasks.py | 36 ++++++++++++++++--------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 401d995fdc7..499d4f6f03d 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1143,6 +1143,11 @@ def outputs(self) -> Dict[str, Dict[int, str]]: common_outputs["last_hidden_state"][1] = f"{common_outputs['last_hidden_state'][1]} / 2" return common_outputs +class SpeechT5OnnxConfig(): + NORMALIZED_CONFIG_CLASS = + + + class Speech2TextDummyAudioInputGenerator(DummyAudioInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 5882972d758..99ba5f1f6e1 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -159,26 +159,27 @@ class TasksManager: # task in a Hub repo that has no pipeline_tag, and no transformersInfo.pipeline_tag, as we then rely on # on transformersInfo["auto_model"] and this dictionary. _TRANSFORMERS_TASKS_TO_MODEL_LOADERS = { + "audio-classification": "AutoModelForAudioClassification", + "audio-frame-classification": "AutoModelForAudioFrameClassification", + "audio-xvector": "AutoModelForAudioXVector", + "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"), "conversational": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"), "feature-extraction": "AutoModel", "fill-mask": "AutoModelForMaskedLM", - "text-generation": "AutoModelForCausalLM", - "text2text-generation": "AutoModelForSeq2SeqLM", - "text-classification": "AutoModelForSequenceClassification", - "token-classification": "AutoModelForTokenClassification", - "multiple-choice": "AutoModelForMultipleChoice", - "object-detection": "AutoModelForObjectDetection", - "question-answering": "AutoModelForQuestionAnswering", "image-classification": "AutoModelForImageClassification", "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), + "image-to-text": "AutoModelForVision2Seq", "mask-generation": "AutoModel", "masked-im": "AutoModelForMaskedImageModeling", + "multiple-choice": "AutoModelForMultipleChoice", + "object-detection": "AutoModelForObjectDetection", + "question-answering": "AutoModelForQuestionAnswering", "semantic-segmentation": "AutoModelForSemanticSegmentation", - "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"), - "audio-classification": "AutoModelForAudioClassification", - "audio-frame-classification": "AutoModelForAudioFrameClassification", - "audio-xvector": "AutoModelForAudioXVector", - "image-to-text": "AutoModelForVision2Seq", + "text-to-speech": "AutoModelForTextToSpectrogram", + "text-generation": "AutoModelForCausalLM", + "text2text-generation": "AutoModelForSeq2SeqLM", + "text-classification": "AutoModelForSequenceClassification", + "token-classification": "AutoModelForTokenClassification", "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", } @@ -264,6 +265,8 @@ class TasksManager: ("pt", "visual-bert", "question-answering"): 
("transformers", "VisualBertForQuestionAnswering"), # VisionEncoderDecoderModel is not registered in AutoModelForDocumentQuestionAnswering ("pt", "vision-encoder-decoder", "document-question-answering"): ("transformers", "VisionEncoderDecoderModel"), + # audio-to-audio task has no AutoModel class. + ("pt", "speecht5", "audio-to-audio"): ("transformers", "SpeechT5ForSpeechToSpeech"), } # TODO: why feature-extraction-with-past is here? @@ -838,6 +841,15 @@ class TasksManager: "automatic-speech-recognition-with-past", onnx="Speech2TextOnnxConfig", ), + "speecht5": supported_tasks_mapping( + "audio-to-audio", + "audio-to-audio-with-past", + "automatic-speech-recognition", + "automatic-speech-recognition-with-past", + "text-to-speech", + "text-to-speech-with-past", + onnx="SpeechT5OnnxConfig", + ), "splinter": supported_tasks_mapping( "feature-extraction", "question-answering", From be26f711926c99605f303fc3059b146edddf5617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:47:39 +0200 Subject: [PATCH 02/16] wip bis --- optimum/exporters/onnx/__main__.py | 6 ++ optimum/exporters/onnx/model_configs.py | 114 +++++++++++++++++++++++- optimum/exporters/onnx/model_patcher.py | 106 +++++++++++++++++++++- optimum/exporters/onnx/utils.py | 46 +++++++++- optimum/exporters/tasks.py | 8 +- 5 files changed, 265 insertions(+), 15 deletions(-) diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 16a18afc552..8b93d487dde 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -38,6 +38,7 @@ get_decoder_models_for_export, get_encoder_decoder_models_for_export, get_sam_models_for_export, + get_speecht5_models_for_export, get_stable_diffusion_models_for_export, ) @@ -69,6 +70,7 @@ def _get_submodels_and_onnx_configs( fn_get_submodels: Optional[Callable] = None, preprocessors: Optional[List[Any]] = None, no_position_ids: bool = False, + model_kwargs: Optional[Dict] = None, ): is_stable_diffusion = "stable-diffusion" in task if not custom_architecture: @@ -99,6 +101,7 @@ def _get_submodels_and_onnx_configs( ) logger.info(f"Using the export variant {onnx_config.variant}. Available variants are:\n{all_variants}") + # TODO: this succession of if/else strongly suggests a refactor is needed. 
if ( model.config.is_encoder_decoder and task.startswith(TasksManager._ENCODER_DECODER_TASKS) @@ -109,6 +112,8 @@ def _get_submodels_and_onnx_configs( models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config) elif model.config.model_type == "sam": models_and_onnx_configs = get_sam_models_for_export(model, onnx_config) + elif model.config.model_type == "speecht5": + models_and_onnx_configs = get_speecht5_models_for_export(model, onnx_config, model_kwargs) else: models_and_onnx_configs = {"model": (model, onnx_config)} @@ -425,6 +430,7 @@ def main_export( preprocessors=preprocessors, _variant=_variant, no_position_ids=no_position_ids, + model_kwargs=model_kwargs, ) if not is_stable_diffusion: diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 499d4f6f03d..febb4c40073 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -55,7 +55,7 @@ TextSeq2SeqOnnxConfig, VisionOnnxConfig, ) -from .model_patcher import SAMModelPatcher, WavLMModelPatcher +from .model_patcher import SAMModelPatcher, SpeechT5ModelPatcher, WavLMModelPatcher if TYPE_CHECKING: @@ -1143,10 +1143,116 @@ def outputs(self) -> Dict[str, Dict[int, str]]: common_outputs["last_hidden_state"][1] = f"{common_outputs['last_hidden_state'][1]} / 2" return common_outputs -class SpeechT5OnnxConfig(): - NORMALIZED_CONFIG_CLASS = - +class DummySpeechT5InputGenerator(DummyInputGenerator): + SUPPORTED_INPUT_NAMES = ("output_sequence", "speaker_embeddings", "spectrogram") + + def __init__( + self, + task: str, + normalized_config: NormalizedConfig, + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + **kwargs, + ): + self.task = task + self.batch_size = 1 # TODO: SpeechT5 does not support batch inference in Transformers for now. + + self.sequence_length = sequence_length + self.speaker_embedding_dim = normalized_config.speaker_embedding_dim + self.num_mel_bins = normalized_config.speaker_embedding_dim + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "output_sequence": + shape = [self.batch_size, self.sequence_length, self.num_mel_bins] + elif input_name == "speaker_embeddings": + shape = [self.batch_size, self.speaker_embedding_dim] + elif input_name == "spectrogram": + shape = [20, self.num_mel_bins] # NOTE: the first axis length is arbitrary and dynamic + else: + raise ValueError(f"Unsupported input {input_name} for DummySpeechT5InputGenerator") + + return self.random_float_tensor( + shape=shape, + min_value=0, + max_value=1, + framework=framework, + dtype=float_dtype, + ) + + +class SpeechT5OnnxConfig(OnnxSeq2SeqConfigWithPast): + # TODO: Transformers batched generation for Speecht5 is BROKEN (https://github.com/huggingface/transformers/pull/25943), + # so we won't support for now. + NORMALIZED_CONFIG_CLASS = None + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTextInputGenerator, + DummySeq2SeqDecoderTextInputGenerator, + T5DummySeq2SeqPastKeyValuesGenerator, + ) + DUMMY_PKV_GENERATOR_CLASS = T5DummySeq2SeqPastKeyValuesGenerator + + # TODO: DO NOT CUT OUTPUT_SEQUENCE LENGTH WITH PAST!!!!! 
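+    # (See the TODO above: the SpeechT5 decoder prenet consumes the full output_sequence at
+    # every generation step, so this input must not be truncated to its last frame even when
+    # past_key_values are used.)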
+
+    VARIANTS = {
+        "transformers-like": "The following components are exported following the Transformers implementation:\n\t - encoder_model.onnx: corresponds to the encoding part in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2544-L2556.\n\t - decoder_model.onnx: corresponds to the decoder part in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2572-L2602.\n\t - decoder_with_past_model.onnx: same as the above, with past_key_values input (KV cache filled).\n\t - decoder_postnet_and_vocoder.onnx: Decoder speech postnet and vocoder (e.g. a SpeechT5HifiGan) to generate speech from the spectrogram, as in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2605-L2614.",
+        "without-cache": "The same as `transformers-like`, without KV cache support. This export is not recommended, as it is slower than `transformers-like`.",
+    }
+    DEFAULT_VARIANT = "transformers-like"
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        common_inputs = {}
+
+        # Batched inference is not supported in Transformers.
+        if self._behavior is ConfigBehavior.ENCODER:
+            common_inputs["input_ids"] = {1: "encoder_sequence_length"}
+        elif self._behavior is ConfigBehavior.DECODER:
+            # NOTE: even when past is used, the decoder takes the full sequence as input, as the prenet seems to require it:
+            # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2573
+            common_inputs["output_sequence"] = {1: "decoder_sequence_length"}
+            common_inputs["speaker_embeddings"] = {}  # No dynamic shape here.
+            common_inputs["encoder_hidden_states"] = {1: "encoder_sequence_length"}
+            common_inputs["encoder_attention_mask"] = {1: "encoder_sequence_length"}
+
+            if self.variant == "transformers-like" and self.use_past_in_inputs:
+                # TODO: check PKV shape
+                self.add_past_key_values(common_inputs, direction="inputs")
+        elif self.is_postnet_and_vocoder:
+            common_inputs["spectrogram"] = {0: "n_spectrums x reduction_factor"}
+        else:
+            raise ValueError(
+                "self._behavior is neither encoder nor decoder, and is_postnet_and_vocoder=False. This should not happen."
+            )
+
+        return common_inputs
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        common_outputs = {}
+        if self._behavior is ConfigBehavior.ENCODER:
+            common_outputs["encoder_hidden_states"] = {1: "encoder_sequence_length"}
+            common_outputs["encoder_attention_mask"] = {1: "encoder_sequence_length"}
+        elif self._behavior is ConfigBehavior.DECODER:
+            common_outputs["output_sequence"] = {1: "decoder_sequence_length + 1"}
+            common_outputs["prob"] = {}  # No dynamic shape here.
+            common_outputs["spectrum"] = {}  # No dynamic shape here.
+
+            if self.variant == "transformers-like" and self.use_past:
+                # When exporting decoder models with use_cache=True, both the decoder without past and with past have the KV cache as an output.
+                self.add_past_key_values(common_outputs, direction="outputs")
+        elif self.is_postnet_and_vocoder:
+            common_outputs["waveform"] = {0: "n_samples"}
+        else:
+            raise ValueError(
+                "self._behavior is neither encoder nor decoder, and is_postnet_and_vocoder=False. This should not happen."
+ ) + + return common_outputs + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return SpeechT5ModelPatcher(self, model, model_kwargs=model_kwargs) class Speech2TextDummyAudioInputGenerator(DummyAudioInputGenerator): diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index e6b50b6dc08..b9abe29421a 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -15,8 +15,9 @@ import dataclasses import functools import inspect -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet from transformers.utils import is_torch_available @@ -34,6 +35,18 @@ logger = logging.get_logger(__name__) +def get_argument(argument_name: str, args: List[Any], kwargs: Dict[str, Any], forward_signature): + """ + Get the argument argument_name from the args and kwargs according to the signature forward_signature. + """ + args = list(args) + if argument_name in forward_signature.parameters: + argument_index = list(forward_signature.parameters.keys()).index(argument_name) + return args[argument_index] + else: + return kwargs[argument_name] + + def override_arguments(args, kwargs, forward_signature, model_kwargs: Dict[str, Any]): """ Override the args and kwargs with the argument values from model_kwargs, following the signature forward_signature corresponding to args and kwargs. @@ -286,9 +299,7 @@ def patched_forward( **kwargs, ) elif config.variant == "split": - # return_dict = get_argument(args, kwargs, signature, "return_dict") if config.vision_encoder: - # pixel_values = get_argument(args, kwargs, signature, "pixel_values") image_positional_embeddings = model.get_image_wide_positional_embeddings() # repeat with batch size @@ -342,3 +353,92 @@ def patched_forward( return {"iou_scores": iou_predictions, "pred_masks": low_res_masks} self.patched_forward = patched_forward + + +class SpeechT5ModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any], + ): + super().__init__(config, model, model_kwargs) + + def patched_forward( + input_ids=None, + speaker_embeddings=None, + encoder_outputs=None, + past_key_values=None, + output_sequence=None, + spectrogram=None, + ): + use_cache = self.real_config.use_past and self.real_config.variant == "transformers-like" + if self.real_config._behavior == "encoder": + encoder_attention_mask = torch.ones_like(input_ids) + + encoder_out = model.speecht5.encoder( + input_values=input_ids, + attention_mask=encoder_attention_mask, + return_dict=True, + ) + # downsample encoder attention mask + if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet): + encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask( + encoder_out[0].shape[1], encoder_attention_mask + ) + + # TODO: that is wrong? 
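+                # (The TODO above: these keys should match the output names declared by
+                # SpeechT5OnnxConfig.outputs, which exposes "encoder_hidden_states" rather
+                # than "encoder_out".)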
+                return {"encoder_out": encoder_out, "encoder_attention_mask": encoder_attention_mask}
+
+            elif self.real_config._behavior == "decoder" and self.real_config.use_past_in_inputs:
+                encoder_hidden_states = encoder_outputs.last_hidden_state
+
+                decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings)
+
+                # Run the decoder layers on the last element of the prenet output.
+                decoder_out = model.speecht5.decoder.wrapped_decoder(
+                    hidden_states=decoder_hidden_states[:, -1:],
+                    attention_mask=None,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    past_key_values=past_key_values,
+                    use_cache=use_cache,
+                    output_attentions=False,
+                    return_dict=True,
+                )
+
+                last_decoder_output = decoder_out.last_hidden_state[0, -1]
+                past_key_values = decoder_out.past_key_values
+
+                # Predict the new mel spectrum for this step in the sequence.
+                spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output)
+                spectrum = spectrum.view(model.config.reduction_factor, model.config.num_mel_bins)
+
+                # NOTE: extending the spectrogram is to be handled outside of the ONNX.
+                # spectrogram.append(spectrum)
+
+                # Extend the output sequence with the new mel spectrum.
+                output_sequence = torch.cat(
+                    (output_sequence, spectrum[-1].view(1, 1, model.config.num_mel_bins)), dim=1
+                )
+
+                # Predict the probability that this is the stop token.
+                prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output))
+
+                return {
+                    "prob": prob,
+                    "output_sequence": output_sequence,
+                    "spectrum": spectrum
+                    # TODO: PKV here
+                }
+            elif self.real_config.is_postnet_and_vocoder:
+                # spectrogram = torch.cat(spectrogram, dim=0).unsqueeze(0)
+                spectrogram = spectrogram.unsqueeze(0)
+                spectrogram = model.speech_decoder_postnet.postnet(spectrogram)
+                spectrogram = spectrogram.squeeze(0)
+
+                waveform = model_kwargs["vocoder"](spectrogram)
+
+                return {"waveform": waveform}
+
+        self.patched_forward = patched_forward
diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py
index 55850451aa7..a24cde52135 100644
--- a/optimum/exporters/onnx/utils.py
+++ b/optimum/exporters/onnx/utils.py
@@ -19,6 +19,7 @@
 import torch
 from packaging import version
+from transformers.models.speecht5.modeling_speecht5 import SpeechT5HifiGan
 from transformers.utils import is_tf_available, is_torch_available

 from ...utils import (
@@ -361,7 +362,7 @@ def _get_submodels_for_export_sam(model, variant):
     if variant == "monolith":
         models_for_export["model"] = model
     else:
-        # We use the model patcher to patch their forward method.
+        # We rather use the model patcher to patch their forward method.
         models_for_export["vision_encoder"] = model
         models_for_export["prompt_encoder_mask_decoder"] = model

@@ -390,6 +391,49 @@ def get_sam_models_for_export(model: Union["PreTrainedModel", "TFPreTrainedModel
     return models_for_export


+def get_speecht5_models_for_export(
+    model: Union["PreTrainedModel", "TFPreTrainedModel"], config: "OnnxConfig", model_kwargs: Optional[Dict]
+):
+    if model_kwargs is None or "vocoder" not in model_kwargs:
+        raise ValueError("The ONNX export of SpeechT5 requires the model_kwargs `vocoder` to be set.")
+
+    models_for_export = {}
+
+    # We rather use the model patcher to patch their forward method.
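+    # The same model object is registered once per subcomponent; the patched forward
+    # dispatches on each sub-config's _behavior / is_postnet_and_vocoder flags, so every
+    # exported ONNX file traces only the relevant part of the network.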
+ models_for_export["encoder_model"] = model + models_for_export["decoder_model"] = model + + if config.variant == "transformers-like": + models_for_export["decoder_with_past_model"] = model + + vocoder = SpeechT5HifiGan.from_pretrained(model_kwargs["vocoder"]) + model_kwargs["vocoder_model"] = vocoder + + models_for_export["decoder_postnet_and_vocoder"] = model + + encoder_onnx_config = config.with_behavior("encoder") + + use_past = config.variant == "transformers-like" + decoder_onnx_config = config.with_behavior("decoder", use_past=use_past, use_past_in_inputs=False) + + models_for_export[ONNX_ENCODER_NAME] = (models_for_export[ONNX_ENCODER_NAME], encoder_onnx_config) + models_for_export[ONNX_DECODER_NAME] = (models_for_export[ONNX_DECODER_NAME], decoder_onnx_config) + if config.variant == "transformers-like": + decoder_onnx_config_with_past = config.with_behavior("decoder", use_past=True, use_past_in_inputs=True) + models_for_export[ONNX_DECODER_WITH_PAST_NAME] = ( + models_for_export[ONNX_DECODER_WITH_PAST_NAME], + decoder_onnx_config_with_past, + ) + + postnet_and_vocoder_onnx_config = config.__class__(..., is_vocoder=True) + models_for_export["decoder_postnet_and_vocoder"] = ( + models_for_export["decoder_postnet_and_vocoder"], + postnet_and_vocoder_onnx_config, + ) + + return models_for_export + + def override_diffusers_2_0_attn_processors(model): for _, submodule in model.named_modules(): if isinstance(submodule, Attention): diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 99ba5f1f6e1..aff43b07ad9 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -265,8 +265,6 @@ class TasksManager: ("pt", "visual-bert", "question-answering"): ("transformers", "VisualBertForQuestionAnswering"), # VisionEncoderDecoderModel is not registered in AutoModelForDocumentQuestionAnswering ("pt", "vision-encoder-decoder", "document-question-answering"): ("transformers", "VisionEncoderDecoderModel"), - # audio-to-audio task has no AutoModel class. - ("pt", "speecht5", "audio-to-audio"): ("transformers", "SpeechT5ForSpeechToSpeech"), } # TODO: why feature-extraction-with-past is here? @@ -841,13 +839,9 @@ class TasksManager: "automatic-speech-recognition-with-past", onnx="Speech2TextOnnxConfig", ), + # TODO: SpeechT5 can also support audio-to-audio and automatic-speech-recognition. "speecht5": supported_tasks_mapping( - "audio-to-audio", - "audio-to-audio-with-past", - "automatic-speech-recognition", - "automatic-speech-recognition-with-past", "text-to-speech", - "text-to-speech-with-past", onnx="SpeechT5OnnxConfig", ), "splinter": supported_tasks_mapping( From 02259a82eb5972559bfee58101dfa554609eff7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:52:34 +0200 Subject: [PATCH 03/16] nit --- optimum/exporters/onnx/model_configs.py | 10 +++++----- optimum/exporters/onnx/model_patcher.py | 3 ++- optimum/exporters/onnx/utils.py | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index febb4c40073..51d5823774f 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1194,10 +1194,10 @@ class SpeechT5OnnxConfig(OnnxSeq2SeqConfigWithPast): # TODO: DO NOT CUT OUTPUT_SEQUENCE LENGTH WITH PAST!!!!! 
VARIANTS = { - "transformers-like": "The following components are exported following Transformers implementation:\n\t - encoder_model.onnx: corresponds to the encoding part in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2544-L2556.\n\t - decoder_model.onnx: corresponds to the decoder part in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2572-L2602.\n\t - decoder_with_past_model.onnx: same as the above, with past_key_values input (KV cache filled).\n\t - decoder_postnet_and_vocoder.onnx: Decoder speech postnet and vocoder (e.g. a SpeechT5HifiGan) to generate speech from the spectrogram, as in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2605-L2614.", - "without-cache": "The same as `transformers-like`, without KV cache support. This is not a recommende export as slower than `transformers-like`.", + "with-past": "The export follows the Transformers implementation using the KV cache, with the following components exported:\n\t - encoder_model.onnx: corresponds to the encoding part in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2544-L2556.\n\t - decoder_model.onnx: corresponds to the decoder part in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2572-L2602.\n\t - decoder_with_past_model.onnx: same as the above, with past_key_values input (KV cache filled).\n\t - decoder_postnet_and_vocoder.onnx: Decoder speech postnet and vocoder (e.g. a SpeechT5HifiGan) to generate speech from the spectrogram, as in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2605-L2614.", + "without-past": "The same as `with-past`, just without KV cache support. This is not a recommended export as slower than `with-past`.", } - DEFAULT_VARIANT = "transformers-like" + DEFAULT_VARIANT = "with-past" @property def inputs(self) -> Dict[str, Dict[int, str]]: @@ -1214,7 +1214,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs["encoder_hidden_states"] = {1: "encoder_sequence_length"} common_inputs["encoder_attention_mask"] = {1: "encoder_sequence_length"} - if self.variant == "transformers-like" and self.use_past_in_inputs: + if self.variant == "with-past" and self.use_past_in_inputs: # TODO: check PKV shape self.add_past_key_values(common_inputs, direction="inputs") elif self.is_postnet_and_vocoder: @@ -1237,7 +1237,7 @@ def outputs(self) -> Dict[str, Dict[int, str]]: common_outputs["prob"] = {} # No dynamic shape here. common_outputs["spectrum"] = {} # No dynamic shape here. - if self.variant == "transformers-like" and self.use_past: + if self.variant == "with-past" and self.use_past: # When exporting decoder models with use_cache=True, both the decoder without past and with past have the KV cache as an output. 
self.add_past_key_values(common_outputs, direction="outputs") elif self.is_postnet_and_vocoder: diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index b9abe29421a..33d92ebb6b6 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -372,7 +372,7 @@ def patched_forward( output_sequence=None, spectrogram=None, ): - use_cache = self.real_config.use_past and self.real_config.variant == "transformers-like" + use_cache = self.real_config.use_past and self.real_config.variant == "with-past" if self.real_config._behavior == "encoder": encoder_attention_mask = torch.ones_like(input_ids) @@ -432,6 +432,7 @@ def patched_forward( # TODO: PKV here } elif self.real_config.is_postnet_and_vocoder: + # NOTE: the following concatenation is expected to be handled outside of the ONNX: # spectrogram = torch.cat(spectrogram, dim=0).unsqueeze(0) spectrogram = spectrogram.unsqueeze(0) spectrogram = model.speech_decoder_postnet.postnet(spectrogram) diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index a24cde52135..1ae682cce9f 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -403,7 +403,7 @@ def get_speecht5_models_for_export( models_for_export["encoder_model"] = model models_for_export["decoder_model"] = model - if config.variant == "transformers-like": + if config.variant == "with-past": models_for_export["decoder_with_past_model"] = model vocoder = SpeechT5HifiGan.from_pretrained(model_kwargs["vocoder"]) @@ -413,12 +413,12 @@ def get_speecht5_models_for_export( encoder_onnx_config = config.with_behavior("encoder") - use_past = config.variant == "transformers-like" + use_past = config.variant == "with-past" decoder_onnx_config = config.with_behavior("decoder", use_past=use_past, use_past_in_inputs=False) models_for_export[ONNX_ENCODER_NAME] = (models_for_export[ONNX_ENCODER_NAME], encoder_onnx_config) models_for_export[ONNX_DECODER_NAME] = (models_for_export[ONNX_DECODER_NAME], decoder_onnx_config) - if config.variant == "transformers-like": + if config.variant == "with-past": decoder_onnx_config_with_past = config.with_behavior("decoder", use_past=True, use_past_in_inputs=True) models_for_export[ONNX_DECODER_WITH_PAST_NAME] = ( models_for_export[ONNX_DECODER_WITH_PAST_NAME], From d181ad22e482b8e26ca172981f48591957f7acdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:56:27 +0200 Subject: [PATCH 04/16] nit^2 --- optimum/exporters/onnx/model_configs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 51d5823774f..d339f565207 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -56,13 +56,12 @@ VisionOnnxConfig, ) from .model_patcher import SAMModelPatcher, SpeechT5ModelPatcher, WavLMModelPatcher - +from ...utils import DummyInputGenerator if TYPE_CHECKING: from transformers import PretrainedConfig from transformers.modeling_utils import PreTrainedModel - from ...utils import DummyInputGenerator from .model_patcher import ModelPatcher if is_tf_available(): From 54d3bc7aefd55270ad468e93c71433b93793260c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 21 Sep 2023 16:18:15 +0200 Subject: [PATCH 05/16] working export --- optimum/commands/export/onnx.py | 7 +++ 
optimum/exporters/onnx/__main__.py | 2 +- optimum/exporters/onnx/base.py | 8 +++- optimum/exporters/onnx/convert.py | 39 ++++++++--------- optimum/exporters/onnx/model_configs.py | 57 ++++++++++++++++++++++--- optimum/exporters/onnx/model_patcher.py | 22 +++++++--- optimum/exporters/onnx/utils.py | 19 +++++++-- optimum/utils/input_generators.py | 1 + 8 files changed, 116 insertions(+), 39 deletions(-) diff --git a/optimum/commands/export/onnx.py b/optimum/commands/export/onnx.py index d496f6f0392..a9ccae15375 100644 --- a/optimum/commands/export/onnx.py +++ b/optimum/commands/export/onnx.py @@ -14,6 +14,7 @@ """Defines the command line for the export with ONNX.""" import argparse +import json from pathlib import Path from typing import TYPE_CHECKING @@ -143,6 +144,11 @@ def parse_args_onnx(parser): "Disable the use of position_ids for text-generation models that require it for batched generation. This argument is introduced for backward compatibility and will be removed in a future release of Optimum." ), ) + optional_group.add_argument( + "--model-kwargs", + type=json.loads, + help=("Any kwargs passed to the model forward, or used to customize the export for a given model."), + ) input_group = parser.add_argument_group( "Input shapes (if necessary, this allows to override the shapes of the input given to the ONNX exporter, that requires an example input)." @@ -256,5 +262,6 @@ def run(self): _variant=self.args.variant, library_name=self.args.library_name, no_position_ids=self.args.no_position_ids, + model_kwargs=self.args.model_kwargs, **input_shapes, ) diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 8b93d487dde..34c85173f80 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -97,7 +97,7 @@ def _get_submodels_and_onnx_configs( onnx_config.variant = _variant all_variants = "\n".join( - [f"\t- {name}: {description}" for name, description in onnx_config.VARIANTS.items()] + [f" - {name}: {description}" for name, description in onnx_config.VARIANTS.items()] ) logger.info(f"Using the export variant {onnx_config.variant}. Available variants are:\n{all_variants}") diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index 1e2ae99955c..ff645b3be2f 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -200,7 +200,8 @@ def __init__( int_dtype: str = "int64", float_dtype: str = "fp32", ): - if task not in self._TASK_TO_COMMON_OUTPUTS: + # Isn't this check useless? 
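+        # ("text-to-speech" has no entry in _TASK_TO_COMMON_OUTPUTS, hence the extra
+        # clause below to let the SpeechT5 export through.)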
+ if task not in self._TASK_TO_COMMON_OUTPUTS and task != "text-to-speech": raise ValueError( f"{task} is not a supported task, supported tasks: {', '.join(self._TASK_TO_COMMON_OUTPUTS.keys())}" ) @@ -808,7 +809,8 @@ def with_behavior( """ if isinstance(behavior, str) and not isinstance(behavior, ConfigBehavior): behavior = ConfigBehavior(behavior) - return self.__class__( + + onnx_config = self.__class__( self._config, task=self.task, int_dtype=self.int_dtype, @@ -818,6 +820,8 @@ def with_behavior( behavior=behavior, preprocessors=self._preprocessors, ) + onnx_config.variant = self.variant + return onnx_config @property def outputs(self) -> Dict[str, Dict[int, str]]: diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index f637da07804..821a39cb06b 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -560,25 +560,26 @@ def remap(value): if is_torch_less_than_1_11: raise RuntimeError("The ONNX export using the PyTorch framework is only supported for v1.11+") else: - with config.patch_model_for_export(model, model_kwargs=model_kwargs): - check_dummy_inputs_are_allowed(model, dummy_inputs) - - inputs = config.ordered_inputs(model) - input_names = list(inputs.keys()) - output_names = list(config.outputs.keys()) - - # Export can work with named args but the dict containing named args has to be the last element of the args - # tuple. - onnx_export( - model, - (dummy_inputs,), - f=output.as_posix(), - input_names=input_names, - output_names=output_names, - dynamic_axes=dict(chain(inputs.items(), config.outputs.items())), - do_constant_folding=True, - opset_version=opset, - ) + with torch.no_grad(): + with config.patch_model_for_export(model, model_kwargs=model_kwargs): + check_dummy_inputs_are_allowed(model, dummy_inputs) + + inputs = config.ordered_inputs(model) + input_names = list(inputs.keys()) + output_names = list(config.outputs.keys()) + + # Export can work with named args but the dict containing named args has to be the last element of the args + # tuple. 
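+                # (dynamic_axes maps each input/output name to its symbolic axes by merging
+                # config.ordered_inputs(model) and config.outputs, e.g.
+                # {"output_sequence": {1: "decoder_sequence_length"}}.)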
+ onnx_export( + model, + (dummy_inputs,), + f=output.as_posix(), + input_names=input_names, + output_names=output_names, + dynamic_axes=dict(chain(inputs.items(), config.outputs.items())), + do_constant_folding=True, + opset_version=opset, + ) # check if external data was exported # TODO: this is quite inefficient as we load in memory if models are <2GB without external data diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d339f565207..bbffcec529f 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -24,6 +24,7 @@ BloomDummyPastKeyValuesGenerator, DummyAudioInputGenerator, DummyDecoderTextInputGenerator, + DummyInputGenerator, DummyPastKeyValuesGenerator, DummyPix2StructInputGenerator, DummyPointsGenerator, @@ -56,7 +57,7 @@ VisionOnnxConfig, ) from .model_patcher import SAMModelPatcher, SpeechT5ModelPatcher, WavLMModelPatcher -from ...utils import DummyInputGenerator + if TYPE_CHECKING: from transformers import PretrainedConfig @@ -1158,7 +1159,7 @@ def __init__( self.sequence_length = sequence_length self.speaker_embedding_dim = normalized_config.speaker_embedding_dim - self.num_mel_bins = normalized_config.speaker_embedding_dim + self.num_mel_bins = normalized_config.num_mel_bins def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "output_sequence": @@ -1182,11 +1183,12 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class SpeechT5OnnxConfig(OnnxSeq2SeqConfigWithPast): # TODO: Transformers batched generation for Speecht5 is BROKEN (https://github.com/huggingface/transformers/pull/25943), # so we won't support for now. - NORMALIZED_CONFIG_CLASS = None + NORMALIZED_CONFIG_CLASS = NormalizedConfig DUMMY_INPUT_GENERATOR_CLASSES = ( DummyTextInputGenerator, DummySeq2SeqDecoderTextInputGenerator, T5DummySeq2SeqPastKeyValuesGenerator, + DummySpeechT5InputGenerator, ) DUMMY_PKV_GENERATOR_CLASS = T5DummySeq2SeqPastKeyValuesGenerator @@ -1198,6 +1200,30 @@ class SpeechT5OnnxConfig(OnnxSeq2SeqConfigWithPast): } DEFAULT_VARIANT = "with-past" + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + use_past: bool = False, + use_past_in_inputs: bool = False, + behavior: ConfigBehavior = ConfigBehavior.MONOLITH, + preprocessors: Optional[List[Any]] = None, + is_postnet_and_vocoder: bool = False, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + use_past=use_past, + use_past_in_inputs=use_past_in_inputs, + behavior=behavior, + preprocessors=preprocessors, + ) + self.is_postnet_and_vocoder = is_postnet_and_vocoder + @property def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = {} @@ -1210,7 +1236,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2573 common_inputs["output_sequence"] = {1: "decoder_sequence_length"} common_inputs["speaker_embeddings"] = {} # No dynamic shape here. 
- common_inputs["encoder_hidden_states"] = {1: "encoder_sequence_length"} + common_inputs["encoder_outputs"] = {1: "encoder_sequence_length"} common_inputs["encoder_attention_mask"] = {1: "encoder_sequence_length"} if self.variant == "with-past" and self.use_past_in_inputs: @@ -1229,12 +1255,12 @@ def inputs(self) -> Dict[str, Dict[int, str]]: def outputs(self) -> Dict[str, Dict[int, str]]: common_outputs = {} if self._behavior is ConfigBehavior.ENCODER: - common_outputs["encoder_hidden_states"] = {1: "encoder_sequence_length"} + common_outputs["encoder_outputs"] = {1: "encoder_sequence_length"} common_outputs["encoder_attention_mask"] = {1: "encoder_sequence_length"} elif self._behavior is ConfigBehavior.DECODER: - common_outputs["output_sequence"] = {1: "decoder_sequence_length + 1"} - common_outputs["prob"] = {} # No dynamic shape here. + common_outputs["output_sequence_out"] = {1: "decoder_sequence_length + 1"} common_outputs["spectrum"] = {} # No dynamic shape here. + common_outputs["prob"] = {} # No dynamic shape here. if self.variant == "with-past" and self.use_past: # When exporting decoder models with use_cache=True, both the decoder without past and with past have the KV cache as an output. @@ -1253,6 +1279,23 @@ def patch_model_for_export( ) -> "ModelPatcher": return SpeechT5ModelPatcher(self, model, model_kwargs=model_kwargs) + @property + def torch_to_onnx_input_map(self) -> Dict[str, str]: + return { + # "decoder_input_ids": "input_ids", + "encoder_outputs": "encoder_hidden_states", + # "attention_mask": "encoder_attention_mask", + } + + def overwrite_shape_and_generate_input( + self, dummy_input_gen: "DummyInputGenerator", input_name: str, framework: str, input_shapes: Dict + ): + dummy_input_gen.batch_size = 1 + dummy_input = dummy_input_gen.generate( + input_name, framework=framework, int_dtype=self.int_dtype, float_dtype=self.float_dtype + ) + return dummy_input + class Speech2TextDummyAudioInputGenerator(DummyAudioInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 33d92ebb6b6..9679159dd7a 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -364,6 +364,8 @@ def __init__( ): super().__init__(config, model, model_kwargs) + model.vocoder = model_kwargs["vocoder_model"] + def patched_forward( input_ids=None, speaker_embeddings=None, @@ -371,6 +373,7 @@ def patched_forward( past_key_values=None, output_sequence=None, spectrogram=None, + encoder_attention_mask=None, ): use_cache = self.real_config.use_past and self.real_config.variant == "with-past" if self.real_config._behavior == "encoder": @@ -387,11 +390,14 @@ def patched_forward( encoder_out[0].shape[1], encoder_attention_mask ) - # TODO: that is wrong? 
- return {"encoder_out": encoder_out, "encoder_attention_mask": encoder_attention_mask} + return { + "encoder_outputs": encoder_out.last_hidden_state, + "encoder_attention_mask": encoder_attention_mask, + } - elif self.real_config._behavior == "decoder" and self.real_config.use_past_in_inputs: - encoder_hidden_states = encoder_outputs.last_hidden_state + elif self.real_config._behavior == "decoder": + # TODO: and self.real_config.use_past_in_inputs + encoder_hidden_states = encoder_outputs[0] decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings) @@ -426,9 +432,9 @@ def patched_forward( prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output)) return { + "output_sequence_out": output_sequence, + "spectrum": spectrum, "prob": prob, - "output_sequence": output_sequence, - "spectrum": spectrum # TODO: PKV here } elif self.real_config.is_postnet_and_vocoder: @@ -438,8 +444,10 @@ def patched_forward( spectrogram = model.speech_decoder_postnet.postnet(spectrogram) spectrogram = spectrogram.squeeze(0) - waveform = model_kwargs["vocoder"](spectrogram) + waveform = model.vocoder(spectrogram) return {"waveform": waveform} + else: + raise ValueError("Should not happen") self.patched_forward = patched_forward diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 1ae682cce9f..aa743db3b44 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -395,7 +395,9 @@ def get_speecht5_models_for_export( model: Union["PreTrainedModel", "TFPreTrainedModel"], config: "OnnxConfig", model_kwargs: Optional[Dict] ): if model_kwargs is None or "vocoder" not in model_kwargs: - raise ValueError("The ONNX export of SpeechT5 requires the model_kwargs `vocoder` to be set.") + raise ValueError( + 'The ONNX export of SpeechT5 requires a vocoder. Please pass `--model-kwargs \'{"vocoder": "vocoder_model_name_or_path"}\'` from the command line, or `model_kwargs={"vocoder": "vocoder_model_name_or_path"}` if calling main_export.' + ) models_for_export = {} @@ -406,7 +408,8 @@ def get_speecht5_models_for_export( if config.variant == "with-past": models_for_export["decoder_with_past_model"] = model - vocoder = SpeechT5HifiGan.from_pretrained(model_kwargs["vocoder"]) + # TODO: more flexibility in the vocoder class? + vocoder = SpeechT5HifiGan.from_pretrained(model_kwargs["vocoder"]).eval() model_kwargs["vocoder_model"] = vocoder models_for_export["decoder_postnet_and_vocoder"] = model @@ -425,7 +428,17 @@ def get_speecht5_models_for_export( decoder_onnx_config_with_past, ) - postnet_and_vocoder_onnx_config = config.__class__(..., is_vocoder=True) + postnet_and_vocoder_onnx_config = config.__class__( + config._config, + task=config.task, + int_dtype=config.int_dtype, + float_dtype=config.float_dtype, + use_past=use_past, + use_past_in_inputs=False, # Irrelevant here. + behavior=config._behavior, # Irrelevant here. 
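+        # (Only is_postnet_and_vocoder matters for this sub-config: it switches the
+        # exported inputs/outputs to the spectrogram -> waveform pair.)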
+ preprocessors=config._preprocessors, + is_postnet_and_vocoder=True, + ) models_for_export["decoder_postnet_and_vocoder"] = ( models_for_export["decoder_postnet_and_vocoder"], postnet_and_vocoder_onnx_config, diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 227c12315d9..72bbb2e618f 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -323,6 +323,7 @@ class DummyTextInputGenerator(DummyInputGenerator): SUPPORTED_INPUT_NAMES = ( "input_ids", "attention_mask", + "encoder_attention_mask", "token_type_ids", "position_ids", ) From b107b2d74c52d1575127b8d1d3ed65ef0334bf40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 21 Sep 2023 17:36:28 +0200 Subject: [PATCH 06/16] working with-past version --- optimum/exporters/onnx/convert.py | 58 ++++++++++++------------- optimum/exporters/onnx/model_configs.py | 48 ++++++++++++++------ optimum/exporters/onnx/model_patcher.py | 26 ++++++++--- optimum/exporters/onnx/utils.py | 1 + 4 files changed, 86 insertions(+), 47 deletions(-) diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 821a39cb06b..0b00667e6c8 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -38,6 +38,7 @@ ) from ..error_utils import AtolError, MinimumVersionError, OutputMatchError, ShapeError from .base import OnnxConfig +from .model_configs import SpeechT5OnnxConfig from .utils import PickableInferenceSession, recursive_to_device @@ -142,7 +143,6 @@ def validate_models_outputs( if use_subprocess: logger.info("Validating models in subprocesses...") exceptions = [] # run all validations before raising - onnx_paths = [] for i, model_name in enumerate(models_and_onnx_configs.keys()): submodel, sub_onnx_config = models_and_onnx_configs[model_name] onnx_model_path = ( @@ -150,7 +150,6 @@ def validate_models_outputs( if onnx_files_subpaths is not None else output_dir.joinpath(model_name + ".onnx") ) - onnx_paths.append(onnx_model_path) try: # Model validation is done in subprocesses, as ONNX Runtime has the bad habit of # not releasing memory once an InferenceSession is initialized. @@ -168,12 +167,12 @@ def validate_models_outputs( model_kwargs=model_kwargs, ) except Exception as e: - exceptions.append(e) + exceptions.append((onnx_model_path, e)) if len(exceptions) != 0: for i, exception in enumerate(exceptions[:-1]): - logger.error(f"Validation {i} for the model {onnx_paths[i].as_posix()} raised: {exception}") - raise exceptions[-1] + logger.error(f"Validation for the model {exception[0].as_posix()} raised: {exception[1]}") + raise exceptions[-1][1] def validate_model_outputs( @@ -423,9 +422,11 @@ def _run_validation( if value_failures: msg = "\n".join(f"- {t[0]}: max diff = {t[1]}" for t in value_failures) - raise AtolError( - f"The maximum absolute difference between the output of the reference model and the ONNX exported model is not within the set tolerance {atol}:\n{msg}" - ) + atol_msg = f"The maximum absolute difference between the output of the reference model and the ONNX exported model is not within the set tolerance {atol}:\n{msg}" + + if isinstance(config, SpeechT5OnnxConfig): + atol_msg += "\nIMPORTANT NOTE: SpeechT5 uses a dropout at inference and the output validation of ONNX Runtime inference vs PyTorch is expected to fail. 
Reference: https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L727" + raise AtolError(atol_msg) class ValidationProcess(mp.Process): @@ -526,7 +527,7 @@ def export_pytorch( with torch.no_grad(): model.config.return_dict = True - model.eval() + model = model.eval() # Check if we need to override certain configuration item if config.values_override is not None: @@ -560,26 +561,25 @@ def remap(value): if is_torch_less_than_1_11: raise RuntimeError("The ONNX export using the PyTorch framework is only supported for v1.11+") else: - with torch.no_grad(): - with config.patch_model_for_export(model, model_kwargs=model_kwargs): - check_dummy_inputs_are_allowed(model, dummy_inputs) - - inputs = config.ordered_inputs(model) - input_names = list(inputs.keys()) - output_names = list(config.outputs.keys()) - - # Export can work with named args but the dict containing named args has to be the last element of the args - # tuple. - onnx_export( - model, - (dummy_inputs,), - f=output.as_posix(), - input_names=input_names, - output_names=output_names, - dynamic_axes=dict(chain(inputs.items(), config.outputs.items())), - do_constant_folding=True, - opset_version=opset, - ) + with config.patch_model_for_export(model, model_kwargs=model_kwargs): + check_dummy_inputs_are_allowed(model, dummy_inputs) + + inputs = config.ordered_inputs(model) + input_names = list(inputs.keys()) + output_names = list(config.outputs.keys()) + + # Export can work with named args but the dict containing named args has to be the last element of the args + # tuple. + onnx_export( + model, + (dummy_inputs,), + f=output.as_posix(), + input_names=input_names, + output_names=output_names, + dynamic_axes=dict(chain(inputs.items(), config.outputs.items())), + do_constant_folding=True, + opset_version=opset, + ) # check if external data was exported # TODO: this is quite inefficient as we load in memory if models are <2GB without external data diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index bbffcec529f..d3ff9944b92 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -327,8 +327,6 @@ class T5OnnxConfig(TextSeq2SeqOnnxConfig): num_attention_heads="num_heads", encoder_num_layers="num_layers", decoder_num_layers="num_decoder_layers", - key_value_dim="d_kv", - allow_new=True, ) def generate_dummy_inputs_for_validation( @@ -1183,16 +1181,22 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class SpeechT5OnnxConfig(OnnxSeq2SeqConfigWithPast): # TODO: Transformers batched generation for Speecht5 is BROKEN (https://github.com/huggingface/transformers/pull/25943), # so we won't support for now. 
- NORMALIZED_CONFIG_CLASS = NormalizedConfig + NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(decoder_num_layers="decoder_layers") + NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( + hidden_size="hidden_size", + num_attention_heads="encoder_attention_heads", # TODO: bugged in case encoder and decoder have different number of heads + encoder_num_layers="encoder_layers", + decoder_num_layers="decoder_layers", + allow_new=True, + ) + DUMMY_INPUT_GENERATOR_CLASSES = ( DummyTextInputGenerator, DummySeq2SeqDecoderTextInputGenerator, - T5DummySeq2SeqPastKeyValuesGenerator, + DummySeq2SeqPastKeyValuesGenerator, DummySpeechT5InputGenerator, ) - DUMMY_PKV_GENERATOR_CLASS = T5DummySeq2SeqPastKeyValuesGenerator - - # TODO: DO NOT CUT OUTPUT_SEQUENCE LENGTH WITH PAST!!!!! + DUMMY_PKV_GENERATOR_CLASS = DummySeq2SeqPastKeyValuesGenerator VARIANTS = { "with-past": "The export follows the Transformers implementation using the KV cache, with the following components exported:\n\t - encoder_model.onnx: corresponds to the encoding part in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2544-L2556.\n\t - decoder_model.onnx: corresponds to the decoder part in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2572-L2602.\n\t - decoder_with_past_model.onnx: same as the above, with past_key_values input (KV cache filled).\n\t - decoder_postnet_and_vocoder.onnx: Decoder speech postnet and vocoder (e.g. a SpeechT5HifiGan) to generate speech from the spectrogram, as in https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2605-L2614.", @@ -1240,7 +1244,6 @@ def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs["encoder_attention_mask"] = {1: "encoder_sequence_length"} if self.variant == "with-past" and self.use_past_in_inputs: - # TODO: check PKV shape self.add_past_key_values(common_inputs, direction="inputs") elif self.is_postnet_and_vocoder: common_inputs["spectrogram"] = {0: "n_spectrums x reduction_factor"} @@ -1281,11 +1284,7 @@ def patch_model_for_export( @property def torch_to_onnx_input_map(self) -> Dict[str, str]: - return { - # "decoder_input_ids": "input_ids", - "encoder_outputs": "encoder_hidden_states", - # "attention_mask": "encoder_attention_mask", - } + return {"encoder_outputs": "encoder_hidden_states"} def overwrite_shape_and_generate_input( self, dummy_input_gen: "DummyInputGenerator", input_name: str, framework: str, input_shapes: Dict @@ -1296,6 +1295,29 @@ def overwrite_shape_and_generate_input( ) return dummy_input + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_decoder_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_decoder_sequence_length + 1" + name = "present" + + for i in range(self._normalized_config.decoder_num_layers): + inputs_or_outputs[f"{name}.{i}.decoder.key"] = {2: decoder_sequence_name} + inputs_or_outputs[f"{name}.{i}.decoder.value"] = {2: decoder_sequence_name} + + if ( + self.is_merged is True + or (self._behavior is ConfigBehavior.DECODER and not self.use_past_in_inputs) + or direction == "inputs" + ): + inputs_or_outputs[f"{name}.{i}.encoder.key"] = {2: "encoder_sequence_length_out"} + 
inputs_or_outputs[f"{name}.{i}.encoder.value"] = {2: "encoder_sequence_length_out"} + class Speech2TextDummyAudioInputGenerator(DummyAudioInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 9679159dd7a..63edefb63cc 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -364,7 +364,7 @@ def __init__( ): super().__init__(config, model, model_kwargs) - model.vocoder = model_kwargs["vocoder_model"] + model.vocoder = model_kwargs["vocoder_model"].eval() def patched_forward( input_ids=None, @@ -390,7 +390,7 @@ def patched_forward( encoder_out[0].shape[1], encoder_attention_mask ) - return { + result = { "encoder_outputs": encoder_out.last_hidden_state, "encoder_attention_mask": encoder_attention_mask, } @@ -431,11 +431,11 @@ def patched_forward( # Predict the probability that this is the stop token. prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output)) - return { + result = { "output_sequence_out": output_sequence, "spectrum": spectrum, "prob": prob, - # TODO: PKV here + "past_key_values": past_key_values, } elif self.real_config.is_postnet_and_vocoder: # NOTE: the following concatenation is expected to be handled outside of the ONNX: @@ -446,8 +446,24 @@ def patched_forward( waveform = model.vocoder(spectrogram) - return {"waveform": waveform} + result = {"waveform": waveform} else: raise ValueError("Should not happen") + # Filter out cross attention past key values output from the decoder using KV cache, as they are constants. + filterd_outputs = {} + for name, value in result.items(): + if name != "past_key_values": + filterd_outputs[name] = value + else: + if self.real_config._behavior == "decoder" and ( + self.real_config.is_merged or not self.real_config.use_past_in_inputs + ): + filterd_outputs[name] = value + elif self.real_config._behavior == "decoder" and self.real_config.use_past_in_inputs: + # The filtering happens here. The decoder with use_past_in_inputs=True corresponds to the autoregressive one. 
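+                    # (Each per-layer value is a 4-tuple: self-attention key/value followed by
+                    # cross-attention key/value; v[:2] keeps only the self-attention pair, as
+                    # the cross-attention cache is constant once computed.)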
+ filterd_outputs[name] = tuple([v[:2] for v in value]) + + return filterd_outputs + self.patched_forward = patched_forward diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index aa743db3b44..25c50a36dcc 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -439,6 +439,7 @@ def get_speecht5_models_for_export( preprocessors=config._preprocessors, is_postnet_and_vocoder=True, ) + postnet_and_vocoder_onnx_config.variant = config.variant models_for_export["decoder_postnet_and_vocoder"] = ( models_for_export["decoder_postnet_and_vocoder"], postnet_and_vocoder_onnx_config, From f8f69ab02c2e8a741c429baf58453434aa9380cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Tue, 26 Sep 2023 17:35:12 +0200 Subject: [PATCH 07/16] add test --- optimum/exporters/onnx/__main__.py | 15 ++++-- optimum/exporters/onnx/base.py | 8 ++-- optimum/exporters/onnx/model_configs.py | 6 +++ optimum/exporters/tasks.py | 33 ++++++------- tests/exporters/exporters_utils.py | 1 + .../exporters/onnx/test_exporters_onnx_cli.py | 46 +++++++++++++++++-- tests/exporters/onnx/test_onnx_export.py | 35 ++++++++++---- 7 files changed, 105 insertions(+), 39 deletions(-) diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 34c85173f80..c6ccf7de6e7 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -369,15 +369,22 @@ def main_export( f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. " f"If you want to support {model_type} please propose a PR or open up an issue." ) - if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task( - task, exporter="onnx" - ): + if model.config.model_type.replace("-", "_") not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True + elif task not in TasksManager.get_supported_tasks_for_model_type(model.config.model_type, "onnx"): + if original_task == "auto": + autodetected_message = " (auto-detected)" + else: + autodetected_message = "" + model_tasks = TasksManager.get_supported_tasks_for_model_type(model.config.model_type, exporter="onnx") + raise ValueError( + f"Asked to export a {model.config.model_type} model for the task {task}{autodetected_message}, but the Optimum ONNX exporter only supports the tasks {', '.join(model_tasks.keys())} for {model.config.model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model.config.model_type}." + ) # TODO: support onnx_config.py in the model repo if custom_architecture and custom_onnx_configs is None: raise ValueError( - f"Trying to export a {model.config.model_type.replace('-', '_')} model, that is a custom or unsupported architecture for the task {task}, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. For the task {task}, the Optimum ONNX exporter supports natively the architectures: {TasksManager.get_supported_model_type_for_task(task, exporter='onnx')}." 
+ f"Trying to export a {model.config.model_type} model, that is a custom or unsupported architecture for the task {task}, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the model type {model.config.model_type} to be supported natively in the ONNX export." ) if custom_architecture and original_task == "auto": diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index ff645b3be2f..f967d0aefac 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -140,6 +140,7 @@ class OnnxConfig(ExportConfig, ABC): MIN_TRANSFORMERS_VERSION = GLOBAL_MIN_TRANSFORMERS_VERSION PATCHING_SPECS: Optional[List["PatchingSpec"]] = None VARIANTS = {"default": "The default ONNX variant."} + DEFAULT_VARIANT = "default" _TASK_TO_COMMON_OUTPUTS = { "audio-classification": OrderedDict({"logits": {0: "batch_size"}}), "audio-frame-classification": OrderedDict({"logits": {0: "batch_size", 1: "sequence_length"}}), @@ -200,11 +201,6 @@ def __init__( int_dtype: str = "int64", float_dtype: str = "fp32", ): - # Isn't this check useless? - if task not in self._TASK_TO_COMMON_OUTPUTS and task != "text-to-speech": - raise ValueError( - f"{task} is not a supported task, supported tasks: {', '.join(self._TASK_TO_COMMON_OUTPUTS.keys())}" - ) self.task = task self.int_dtype = int_dtype self.float_dtype = float_dtype @@ -212,6 +208,7 @@ def __init__( self._config = config self._preprocessors = preprocessors self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self.variant = "default" def _create_dummy_input_generator_classes(self, **kwargs) -> List[DummyInputGenerator]: """ @@ -1010,6 +1007,7 @@ def __init__(self, config: OnnxConfig, int_dtype: str = "int64", float_dtype: st self.float_dtype = float_dtype self._normalized_config = self._onnx_config._normalized_config self.PATCHING_SPECS = self._onnx_config.PATCHING_SPECS + self.variant = "default" @classmethod def from_onnx_config(cls, config: OnnxConfig) -> "OnnxConfigWithLoss": diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d3ff9944b92..cc64076025c 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -327,6 +327,8 @@ class T5OnnxConfig(TextSeq2SeqOnnxConfig): num_attention_heads="num_heads", encoder_num_layers="num_layers", decoder_num_layers="num_decoder_layers", + key_value_dim="d_kv", + allow_new=True, ) def generate_dummy_inputs_for_validation( @@ -1226,6 +1228,10 @@ def __init__( behavior=behavior, preprocessors=preprocessors, ) + if float_dtype == "fp16": + raise ValueError( + "The ONNX export of SpeechT5 in float16 is currently not supported due to a bug in PyTorch: https://github.com/pytorch/pytorch/pull/110078. Please open an issue in Optimum if you would like to export SpeechT5 in float16." 
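+                # (Raising in the config constructor makes the export fail fast, before any tracing work is done.)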
+ ) self.is_postnet_and_vocoder = is_postnet_and_vocoder @property diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index aff43b07ad9..72b6a8a7c24 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -175,7 +175,7 @@ class TasksManager: "object-detection": "AutoModelForObjectDetection", "question-answering": "AutoModelForQuestionAnswering", "semantic-segmentation": "AutoModelForSemanticSegmentation", - "text-to-speech": "AutoModelForTextToSpectrogram", + "text-to-audio": "AutoModelForTextToSpectrogram", "text-generation": "AutoModelForCausalLM", "text2text-generation": "AutoModelForSeq2SeqLM", "text-classification": "AutoModelForSequenceClassification", @@ -230,22 +230,23 @@ class TasksManager: } _SYNONYM_TASK_MAP = { - "sequence-classification": "text-classification", + "audio-ctc": "automatic-speech-recognition", "causal-lm": "text-generation", "causal-lm-with-past": "text-generation-with-past", + "default": "feature-extraction", + "default-with-past": "feature-extraction-with-past", + "masked-lm": "fill-mask", + "mask-generation": "feature-extraction", + "sentence-similarity": "feature-extraction", "seq2seq-lm": "text2text-generation", "seq2seq-lm-with-past": "text2text-generation-with-past", + "sequence-classification": "text-classification", "speech2seq-lm": "automatic-speech-recognition", "speech2seq-lm-with-past": "automatic-speech-recognition-with-past", - "masked-lm": "fill-mask", - "mask-generation": "feature-extraction", - "vision2seq-lm": "image-to-text", - "default": "feature-extraction", - "default-with-past": "feature-extraction-with-past", - "audio-ctc": "automatic-speech-recognition", - "translation": "text2text-generation", - "sentence-similarity": "feature-extraction", "summarization": "text2text-generation", + "text-to-speech": "text-to-audio", + "translation": "text2text-generation", + "vision2seq-lm": "image-to-text", "zero-shot-classification": "text-classification", } @@ -269,12 +270,12 @@ class TasksManager: # TODO: why feature-extraction-with-past is here? _ENCODER_DECODER_TASKS = ( - "text2text-generation", "automatic-speech-recognition", - "image-to-text", + "document-question-answering", "feature-extraction-with-past", + "image-to-text", + "text2text-generation", "visual-question-answering", - "document-question-answering", ) # TODO: some models here support text-generation export but are not supported in ORTModelForCausalLM @@ -841,7 +842,7 @@ class TasksManager: ), # TODO: SpeechT5 can also support audio-to-audio and automatic-speech-recognition. "speecht5": supported_tasks_mapping( - "text-to-speech", + "text-to-audio", onnx="SpeechT5OnnxConfig", ), "splinter": supported_tasks_mapping( @@ -1398,8 +1399,8 @@ def _infer_task_from_model_name_or_path( else: pipeline_tag = getattr(model_info, "pipeline_tag", None) # conversational is not a supported task per se, just an alias that may map to - # text-generaton or text2text-generation - if pipeline_tag is not None and pipeline_tag != "conversational": + # text-generaton or text2text-generation. 
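+            # (For example, the Hub pipeline tag "text-to-speech" resolves to the exporter task "text-to-audio"
+            # through the _SYNONYM_TASK_MAP entries updated above.)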
+ if pipeline_tag is not None and pipeline_tag not in ["conversational"]: inferred_task_name = TasksManager.map_from_synonym(model_info.pipeline_tag) else: transformers_info = model_info.transformersInfo diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 34ecf444212..266bff6fc07 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -150,6 +150,7 @@ # Disabled for now because some operator seems to not be supported by ONNX. # "mctct": "hf-internal-testing/tiny-random-MCTCTModel", "speech-to-text": "hf-internal-testing/tiny-random-Speech2TextModel", + "speecht5": "hf-internal-testing/tiny-random-SpeechT5ForTextToSpeech", "xlm": "hf-internal-testing/tiny-random-XLMModel", "xlm-roberta": "hf-internal-testing/tiny-xlm-roberta", "vision-encoder-decoder": { diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py index b1cdedbea84..c723610a128 100644 --- a/tests/exporters/onnx/test_exporters_onnx_cli.py +++ b/tests/exporters/onnx/test_exporters_onnx_cli.py @@ -154,6 +154,7 @@ def _onnx_export( device: str = "cpu", fp16: bool = False, variant: str = "default", + model_kwargs: Optional[Dict] = None, ): with TemporaryDirectory() as tmpdir: try: @@ -167,6 +168,7 @@ def _onnx_export( monolith=monolith, no_post_process=no_post_process, _variant=variant, + model_kwargs=model_kwargs, ) except MinimumVersionError as e: pytest.skip(f"Skipping due to minimum version requirements not met. Full error: {e}") @@ -270,7 +272,12 @@ def test_exporters_cli_pytorch_cpu( # masked-im models use MaskedImageModelingOutput if model_type in ["vit", "deit"] and task == "masked-im": self.skipTest("Temporarily disabled upon transformers 4.28 release") - self._onnx_export(model_name, task, monolith, no_post_process, variant=variant) + + model_kwargs = None + if model_type == "speecht5": + model_kwargs = {"vocoder": "fxmarty/speecht5-hifigan-tiny"} + + self._onnx_export(model_name, task, monolith, no_post_process, variant=variant, model_kwargs=model_kwargs) @parameterized.expand(_get_models_to_test(PYTORCH_EXPORT_MODELS_TINY)) @require_vision @@ -294,7 +301,13 @@ def test_exporters_cli_pytorch_gpu( if model_type == "sam": self.skipTest("sam export on cuda is not supported due to a bug in PyTorch") - self._onnx_export(model_name, task, monolith, no_post_process, device="cuda", variant=variant) + model_kwargs = None + if model_type == "speecht5": + model_kwargs = {"vocoder": "fxmarty/speecht5-hifigan-tiny"} + + self._onnx_export( + model_name, task, monolith, no_post_process, device="cuda", variant=variant, model_kwargs=model_kwargs + ) @parameterized.expand(_get_models_to_test(PYTORCH_EXPORT_MODELS_TINY)) @require_torch @@ -311,10 +324,20 @@ def test_exporters_cli_pytorch_with_optimization( monolith: bool, no_post_process: bool, ): + model_kwargs = None + if model_type == "speecht5": + model_kwargs = {"vocoder": "fxmarty/speecht5-hifigan-tiny"} + for optimization_level in ["O1", "O2", "O3"]: try: self._onnx_export( - model_name, task, monolith, no_post_process, optimization_level=optimization_level, variant=variant + model_name, + task, + monolith, + no_post_process, + optimization_level=optimization_level, + variant=variant, + model_kwargs=model_kwargs, ) except NotImplementedError as e: if "Tried to use ORTOptimizer for the model type" in str( @@ -348,9 +371,20 @@ def test_exporters_cli_pytorch_with_O4_optimization( if model_type == "sam": self.skipTest("sam export on cuda is not supported due to 
a bug in PyTorch") + model_kwargs = None + if model_type == "speecht5": + model_kwargs = {"vocoder": "fxmarty/speecht5-hifigan-tiny"} + try: self._onnx_export( - model_name, task, monolith, no_post_process, optimization_level="O4", device="cuda", variant=variant + model_name, + task, + monolith, + no_post_process, + optimization_level="O4", + device="cuda", + variant=variant, + model_kwargs=model_kwargs, ) except NotImplementedError as e: if "Tried to use ORTOptimizer for the model type" in str( @@ -453,6 +487,10 @@ def test_export_on_fp16( if model_type == "ibert": self.skipTest("ibert can not be supported in fp16") + # TODO: test once https://github.com/pytorch/pytorch/pull/110078 is fixed + if model_type == "speecht5": + self.skipTest("speecht5 can not be supported in fp16 due to a pytorch bug") + self._onnx_export(model_name, task, monolith, no_post_process, variant=variant, fp16=True, device="cuda") @parameterized.expand( diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index 10eaeddd13c..cabf16a69fe 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -39,6 +39,7 @@ from optimum.exporters.onnx.base import ConfigBehavior from optimum.exporters.onnx.config import TextDecoderOnnxConfig from optimum.exporters.onnx.model_configs import WhisperOnnxConfig +from optimum.exporters.onnx.utils import get_speecht5_models_for_export from optimum.utils import ONNX_WEIGHTS_NAME, DummyPastKeyValuesGenerator, NormalizedTextConfig from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_timm @@ -215,6 +216,7 @@ def _onnx_export( if isinstance(atol, dict): atol = atol[task.replace("-with-past", "")] + model_kwargs = None if ( model.config.is_encoder_decoder and task.startswith( @@ -230,6 +232,9 @@ def _onnx_export( models_and_onnx_configs = get_encoder_decoder_models_for_export(model, onnx_config) elif task.startswith("text-generation") and monolith is False: models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config) + elif model.config.model_type == "speecht5": + model_kwargs = {"vocoder": "fxmarty/speecht5-hifigan-tiny"} + models_and_onnx_configs = get_speecht5_models_for_export(model, onnx_config, model_kwargs) else: models_and_onnx_configs = {"model": (model, onnx_config)} @@ -239,6 +244,7 @@ def _onnx_export( opset=onnx_config.DEFAULT_ONNX_OPSET, output_dir=Path(tmpdirname), device=device, + model_kwargs=model_kwargs, ) input_shapes_iterator = grid_parameters(shapes_to_validate, yield_dict=True, add_test_name=False) for input_shapes in input_shapes_iterator: @@ -268,6 +274,7 @@ def _onnx_export( output_dir=Path(tmpdirname), input_shapes=input_shapes, device=device, + model_kwargs=model_kwargs, ) except AtolError as e: print(f"The ONNX export succeeded with the warning: {e}") @@ -317,15 +324,18 @@ def test_all_models_tested(self): def test_pytorch_export_on_cpu( self, test_name, - name, + model_type, model_name, task, onnx_config_class_constructor, monolith: bool, ): + if model_type == "speecht5" and monolith: + self.skipTest("unsupported export") + self._onnx_export( test_name, - name, + model_type, model_name, task, onnx_config_class_constructor, @@ -343,15 +353,18 @@ def test_pytorch_export_on_cpu( def test_pytorch_export_on_cuda( self, test_name, - name, + model_type, model_name, task, onnx_config_class_constructor, monolith: bool, ): + if model_type == "speecht5" and monolith: + self.skipTest("unsupported export") + self._onnx_export( test_name, - name, 
+ model_type, model_name, task, onnx_config_class_constructor, @@ -366,11 +379,13 @@ def test_pytorch_export_on_cuda( @require_tf @require_vision @pytest.mark.tensorflow_test - def test_tensorflow_export(self, test_name, name, model_name, task, onnx_config_class_constructor, monolith: bool): + def test_tensorflow_export( + self, test_name, model_type, model_name, task, onnx_config_class_constructor, monolith: bool + ): if monolith is False: return 0 - self._onnx_export(test_name, name, model_name, task, onnx_config_class_constructor, monolith=monolith) + self._onnx_export(test_name, model_type, model_name, task, onnx_config_class_constructor, monolith=monolith) @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) @require_torch @@ -400,7 +415,7 @@ def test_pytorch_export_for_stable_diffusion_models_cuda(self, model_type, model def test_pytorch_export_for_timm_on_cpu( self, test_name, - name, + model_type, model_name, task, onnx_config_class_constructor, @@ -408,7 +423,7 @@ def test_pytorch_export_for_timm_on_cpu( ): self._onnx_export( test_name, - name, + model_type, model_name, task, onnx_config_class_constructor, @@ -428,7 +443,7 @@ def test_pytorch_export_for_timm_on_cpu( def test_pytorch_export_for_timm_on_cuda( self, test_name, - name, + model_type, model_name, task, onnx_config_class_constructor, @@ -436,7 +451,7 @@ def test_pytorch_export_for_timm_on_cuda( ): self._onnx_export( test_name, - name, + model_type, model_name, task, onnx_config_class_constructor, From 69313a1a9daf3e8cdd7df761597546c354d9bbc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Tue, 26 Sep 2023 17:36:16 +0200 Subject: [PATCH 08/16] add doc --- docs/source/exporters/onnx/overview.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 34d7b62cf4a..0a548a3d3a1 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -81,6 +81,7 @@ Supported architectures: - SEW - SEW-D - Speech2Text +- SpeechT5 - Splinter - SqueezeBert - Stable Diffusion From b88ed066bd867f126bff36a89c1187d9fc2b8c3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Tue, 26 Sep 2023 18:08:36 +0200 Subject: [PATCH 09/16] working merged onnx --- optimum/exporters/onnx/base.py | 9 ++++++--- optimum/onnx/transformations_utils.py | 16 ++++++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index f967d0aefac..fff436823fd 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -904,7 +904,7 @@ def post_process_exported_models( ) # Attempt to merge only if the decoder was exported without/with past - if self.use_past is True and len(models_and_onnx_configs) == 3: + if self.use_past is True or self.variant == "with-past": decoder_path = Path(path, onnx_files_subpaths[1]) decoder_with_past_path = Path(path, onnx_files_subpaths[2]) decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") @@ -923,7 +923,8 @@ def post_process_exported_models( # In order to do the validation of the two branches on the same file encoder_path = onnx_files_subpaths[0] - onnx_files_subpaths = [encoder_path, decoder_merged_path.name, decoder_merged_path.name] + onnx_files_subpaths_new = [encoder_path, decoder_merged_path.name, decoder_merged_path.name] + 
onnx_files_subpaths_new.extend(onnx_files_subpaths[3:]) # We validate the two branches of the decoder model then models_and_onnx_configs[ONNX_DECODER_NAME][1].is_merged = True @@ -934,8 +935,10 @@ def post_process_exported_models( models_and_onnx_configs[ONNX_DECODER_WITH_PAST_NAME][1].use_cache_branch = True models_and_onnx_configs[ONNX_DECODER_WITH_PAST_NAME][1].is_merged = True + else: + onnx_files_subpaths_new = onnx_files_subpaths - return models_and_onnx_configs, onnx_files_subpaths + return models_and_onnx_configs, onnx_files_subpaths_new def generate_dummy_inputs_for_validation( self, reference_model_inputs: Dict[str, Any], onnx_input_names: Optional[List[str]] = None diff --git a/optimum/onnx/transformations_utils.py b/optimum/onnx/transformations_utils.py index c5b3ad417ba..05931753bfd 100644 --- a/optimum/onnx/transformations_utils.py +++ b/optimum/onnx/transformations_utils.py @@ -178,16 +178,20 @@ def _unify_onnx_outputs(model1: ModelProto, model2: ModelProto, strict: bool): if strict is False and model_output_1.name not in model2_outputs: data_type = model_output_1.type.tensor_type.elem_type dims_output_1 = _infer_output_shape(model_output_1) - if not isinstance(dims_output_1[0], str): + if not any(isinstance(dim_output, str) for dim_output in dims_output_1): raise ValueError( - f"Expected a dynamic shape for the axis zero of {model_output_1.name}, found a static shape: {dims_output_1[0]}" + f"Expected at least one dynamic input shape for the output {model_output_1.name}, found a static shape: {dims_output_1}" ) - # fill the constant shape with the original shape, except for the axis zero that is 0 for an empty constant, + # fill the constant shape with the original shape, except for the first dynamic axis that is 0 for an empty constant, # and the dynamic axis set to 1 - dims_dummy_output = [0] - for dim in dims_output_1[1:]: - if isinstance(dim, str): + dims_dummy_output = [] + dummy_axis = None + for j, dim in enumerate(dims_output_1): + if isinstance(dim, str) and dummy_axis is None: + dims_dummy_output.append(0) + dummy_axis = j + elif isinstance(dim, str) and dummy_axis is not None: dims_dummy_output.append(1) else: dims_dummy_output.append(dim) From 918893eec6b68214c964aec90cf8c40396ba66bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 5 Oct 2023 13:03:11 +0200 Subject: [PATCH 10/16] fix dropout with training=True export --- optimum/exporters/onnx/model_patcher.py | 51 +++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 63edefb63cc..82cec1eeb5e 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -15,6 +15,7 @@ import dataclasses import functools import inspect +import types from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet @@ -355,7 +356,55 @@ def patched_forward( self.patched_forward = patched_forward +def patched_speecht5_prenet_forward( + self, + input_values: torch.Tensor, + speaker_embeddings: Optional[torch.Tensor] = None, +): + # Dropout is always applied, even when evaluating. See ยง2.2 in https://arxiv.org/abs/1712.05884. 
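+    # (Keeping dropout active at inference time is deliberate for the prenet: it introduces the output variation
+    # described in the paper referenced above. With p = config.speech_decoder_prenet_dropout, each activation
+    # survives with probability 1 - p and the survivors are rescaled by 1 / (1 - p), preserving the expected value.)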
+ + inputs_embeds = input_values + for layer in self.layers: + inputs_embeds = torch.nn.functional.relu(layer(inputs_embeds)) + + # NOTE: we patch the prenet to avoid using torch.nn.functional.dropout, that is exported as a `Dropout` node in the ONNX + # that is ignored during inference by some runtimes as ONNX Runtime. + # Reference: https://github.com/microsoft/onnxruntime/issues/9333 & https://github.com/microsoft/onnxruntime/issues/5549 + mask = torch.rand(inputs_embeds.shape, device=inputs_embeds.device) > self.config.speech_decoder_prenet_dropout + inputs_embeds = inputs_embeds * mask / (1 - self.config.speech_decoder_prenet_dropout) + + # inputs_embeds = nn.functional.dropout( + # inputs_embeds, self.config.speech_decoder_prenet_dropout, training=True + # ) + + inputs_embeds = self.final_layer(inputs_embeds) + inputs_embeds = self.encode_positions(inputs_embeds) + + if speaker_embeddings is not None: + speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings) + speaker_embeddings = speaker_embeddings.unsqueeze(1) + speaker_embeddings = speaker_embeddings.expand(-1, inputs_embeds.size(1), -1) + inputs_embeds = torch.cat([inputs_embeds, speaker_embeddings], dim=-1) + inputs_embeds = torch.nn.functional.relu(self.speaker_embeds_layer(inputs_embeds)) + + return inputs_embeds + + class SpeechT5ModelPatcher(ModelPatcher): + def __enter__(self): + self.patch_ops() + self._model.speecht5.decoder.prenet.forward = types.MethodType( + patched_speecht5_prenet_forward, self._model.speecht5.decoder.prenet + ) + setattr(self._model, self.orig_forward_name, self.patched_forward) + + def __exit__(self, exc_type, exc_value, traceback): + self.restore_ops() + setattr(self._model, self.orig_forward_name, self.orig_forward) + self._model.speecht5.decoder.prenet.forward = types.MethodType( + self.original_speecht5_prenet_forward, self._model.speecht5.decoder.prenet + ) + def __init__( self, config: "OnnxConfig", @@ -364,6 +413,8 @@ def __init__( ): super().__init__(config, model, model_kwargs) + self.original_speecht5_prenet_forward = model.speecht5.decoder.prenet.forward + model.vocoder = model_kwargs["vocoder_model"].eval() def patched_forward( From 74ba08c9f7b90ecce1a44e32f0c1696ca92baf28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 5 Oct 2023 14:56:14 +0200 Subject: [PATCH 11/16] test fix --- optimum/exporters/onnx/__main__.py | 13 ++----------- optimum/exporters/onnx/base.py | 4 ++-- optimum/exporters/tasks.py | 12 +++++++++++- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index c6ccf7de6e7..3dbee581062 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -366,20 +366,11 @@ def main_export( if not is_stable_diffusion: if model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE: raise ValueError( - f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. " + f"{model_type} is not supported yet. Only {list(TasksManager._SUPPORTED_CLI_MODEL_TYPE.keys())} are supported. " f"If you want to support {model_type} please propose a PR or open up an issue." 
) - if model.config.model_type.replace("-", "_") not in TasksManager._SUPPORTED_MODEL_TYPE: + if model.config.model_type.replace("_", "-") not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True - elif task not in TasksManager.get_supported_tasks_for_model_type(model.config.model_type, "onnx"): - if original_task == "auto": - autodetected_message = " (auto-detected)" - else: - autodetected_message = "" - model_tasks = TasksManager.get_supported_tasks_for_model_type(model.config.model_type, exporter="onnx") - raise ValueError( - f"Asked to export a {model.config.model_type} model for the task {task}{autodetected_message}, but the Optimum ONNX exporter only supports the tasks {', '.join(model_tasks.keys())} for {model.config.model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model.config.model_type}." - ) # TODO: support onnx_config.py in the model repo if custom_architecture and custom_onnx_configs is None: diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index fff436823fd..635b3b2997a 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -903,8 +903,8 @@ def post_process_exported_models( path, models_and_onnx_configs, onnx_files_subpaths ) - # Attempt to merge only if the decoder was exported without/with past - if self.use_past is True or self.variant == "with-past": + # Attempt to merge only if the decoder was exported without/with past, and ignore seq2seq models exported with text-generation task + if len(onnx_files_subpaths) >= 3 and self.use_past is True or self.variant == "with-past": decoder_path = Path(path, onnx_files_subpaths[1]) decoder_with_past_path = Path(path, onnx_files_subpaths[2]) decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 72b6a8a7c24..8cf8cae4863 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1053,7 +1053,7 @@ def get_supported_tasks_for_model_type( `TaskNameToExportConfigDict`: The dictionary mapping each task to a corresponding `ExportConfig` constructor. """ - model_type = model_type.lower() + model_type = model_type.lower().replace("_", "-") model_type_and_model_name = f"{model_type} ({model_name})" if model_name else model_type if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: raise KeyError( @@ -1687,6 +1687,16 @@ def get_model_from_task( if original_task == "auto" and config.architectures is not None: model_class_name = config.architectures[0] + if task not in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx"): + if original_task == "auto": + autodetected_message = " (auto-detected)" + else: + autodetected_message = "" + model_tasks = TasksManager.get_supported_tasks_for_model_type(model_type, exporter="onnx") + raise ValueError( + f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum ONNX exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}." 
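+                # (Placing this check in get_model_from_task means an unsupported task fails before the model weights are loaded.)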
+ ) + model_class = TasksManager.get_model_class_for_task( task, framework, model_type=model_type, model_class_name=model_class_name, library=library_name ) From c5a8a1d89a336bcf41ece1c89598d9f6d4f09aaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 5 Oct 2023 15:52:38 +0200 Subject: [PATCH 12/16] fix custom models --- optimum/exporters/onnx/__main__.py | 26 +++++++++++++++++++++++++- optimum/exporters/tasks.py | 12 +----------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 3dbee581062..fcafd3fd8d9 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -19,7 +19,7 @@ from pathlib import Path from requests.exceptions import ConnectionError as RequestsConnectionError -from transformers import AutoTokenizer +from transformers import AutoConfig, AutoTokenizer from transformers.utils import is_torch_available from ...commands.export.onnx import parse_args_onnx @@ -338,6 +338,30 @@ def main_export( f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" ) + if library_name == "transformers": + config = AutoConfig.from_pretrained( + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + use_auth_token=use_auth_token, + local_files_only=local_files_only, + force_download=force_download, + trust_remote_code=trust_remote_code, + ) + model_type = config.model_type.replace("_", "-") + if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: + custom_architecture = True + elif task not in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx"): + if original_task == "auto": + autodetected_message = " (auto-detected)" + else: + autodetected_message = "" + model_tasks = TasksManager.get_supported_tasks_for_model_type(model_type, exporter="onnx") + raise ValueError( + f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum ONNX exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}." + ) + model = TasksManager.get_model_from_task( task, model_name_or_path, diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 8cf8cae4863..baf163b1691 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1058,7 +1058,7 @@ def get_supported_tasks_for_model_type( if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: raise KeyError( f"{model_type_and_model_name} is not supported yet. " - f"Only {TasksManager._SUPPORTED_MODEL_TYPE} are supported. " + f"Only {list(TasksManager._SUPPORTED_MODEL_TYPE.keys())} are supported. " f"If you want to support {model_type} please propose a PR or open up an issue." 
) elif exporter not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]: @@ -1687,16 +1687,6 @@ def get_model_from_task( if original_task == "auto" and config.architectures is not None: model_class_name = config.architectures[0] - if task not in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx"): - if original_task == "auto": - autodetected_message = " (auto-detected)" - else: - autodetected_message = "" - model_tasks = TasksManager.get_supported_tasks_for_model_type(model_type, exporter="onnx") - raise ValueError( - f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum ONNX exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}." - ) - model_class = TasksManager.get_model_class_for_task( task, framework, model_type=model_type, model_class_name=model_class_name, library=library_name ) From 2f9661d31b069b2018e0a2be82774a00b4e0245d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Thu, 5 Oct 2023 16:08:38 +0200 Subject: [PATCH 13/16] some cleaning --- optimum/exporters/onnx/model_patcher.py | 14 +------------- optimum/exporters/tasks.py | 2 +- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 82cec1eeb5e..c90b5baebaf 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -16,7 +16,7 @@ import functools import inspect import types -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet from transformers.utils import is_torch_available @@ -36,18 +36,6 @@ logger = logging.get_logger(__name__) -def get_argument(argument_name: str, args: List[Any], kwargs: Dict[str, Any], forward_signature): - """ - Get the argument argument_name from the args and kwargs according to the signature forward_signature. - """ - args = list(args) - if argument_name in forward_signature.parameters: - argument_index = list(forward_signature.parameters.keys()).index(argument_name) - return args[argument_index] - else: - return kwargs[argument_name] - - def override_arguments(args, kwargs, forward_signature, model_kwargs: Dict[str, Any]): """ Override the args and kwargs with the argument values from model_kwargs, following the signature forward_signature corresponding to args and kwargs. diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index baf163b1691..a7b2d28facd 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1400,7 +1400,7 @@ def _infer_task_from_model_name_or_path( pipeline_tag = getattr(model_info, "pipeline_tag", None) # conversational is not a supported task per se, just an alias that may map to # text-generaton or text2text-generation. 
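
An aside on the model-type normalization applied in the two patches above: a minimal standalone sketch, under the assumption that the exporter's lookup tables (such as TasksManager._SUPPORTED_MODEL_TYPE) stay keyed by lower-case, hyphenated model types.

    # Config-reported model types may use underscores (e.g. "speech_to_text"),
    # while the exporter tables are keyed with hyphens, hence .replace("_", "-").
    def normalize_model_type(model_type: str) -> str:
        return model_type.lower().replace("_", "-")

    assert normalize_model_type("Speech_to_Text") == "speech-to-text"
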
- if pipeline_tag is not None and pipeline_tag not in ["conversational"]: + if pipeline_tag is not None and pipeline_tag != "conversational": inferred_task_name = TasksManager.map_from_synonym(model_info.pipeline_tag) else: transformers_info = model_info.transformersInfo From 595ff148d8a4a41c434687ae8d4bed02a9e890e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Tue, 17 Oct 2023 15:16:43 +0200 Subject: [PATCH 14/16] merge mess --- optimum/commands/export/onnx.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/optimum/commands/export/onnx.py b/optimum/commands/export/onnx.py index 7625c5af74e..55f8b9dc1d3 100644 --- a/optimum/commands/export/onnx.py +++ b/optimum/commands/export/onnx.py @@ -142,6 +142,14 @@ def parse_args_onnx(parser): type=json.loads, help=("Any kwargs passed to the model forward, or used to customize the export for a given model."), ) + optional_group.add_argument( + "--legacy", + action="store_true", + help=( + "Export decoder only models in three files (without + with past and the resulting merged model)." + "Also disable the use of position_ids for text-generation models that require it for batched generation. This argument is introduced for backward compatibility and will be removed in a future release of Optimum." + ), + ) input_group = parser.add_argument_group( "Input shapes (if necessary, this allows to override the shapes of the input given to the ONNX exporter, that requires an example input)." From 563424cfc8ad9611e66bbc1d94b4018d6d54a9f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Tue, 17 Oct 2023 15:20:43 +0200 Subject: [PATCH 15/16] address review comments --- optimum/exporters/onnx/base.py | 2 +- optimum/exporters/onnx/model_configs.py | 37 +------------------------ optimum/utils/__init__.py | 1 + optimum/utils/input_generators.py | 36 ++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 37 deletions(-) diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index f7177845e37..1e5704e8937 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -904,7 +904,7 @@ def post_process_exported_models( ) # Attempt to merge only if the decoder was exported without/with past, and ignore seq2seq models exported with text-generation task - if len(onnx_files_subpaths) >= 3 and self.use_past is True or self.variant == "with-past": + if len(onnx_files_subpaths) >= 3 and self.use_past is True: decoder_path = Path(path, onnx_files_subpaths[1]) decoder_with_past_path = Path(path, onnx_files_subpaths[2]) decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 7bb48f887fa..7f9e0bb0f85 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -30,6 +30,7 @@ DummyPointsGenerator, DummySeq2SeqDecoderTextInputGenerator, DummySeq2SeqPastKeyValuesGenerator, + DummySpeechT5InputGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, DummyVisionEmbeddingsGenerator, @@ -1219,42 +1220,6 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return common_outputs -class DummySpeechT5InputGenerator(DummyInputGenerator): - SUPPORTED_INPUT_NAMES = ("output_sequence", "speaker_embeddings", "spectrogram") - - def __init__( - self, - task: str, - normalized_config: NormalizedConfig, - sequence_length: int = 
DEFAULT_DUMMY_SHAPES["sequence_length"], - **kwargs, - ): - self.task = task - self.batch_size = 1 # TODO: SpeechT5 does not support batch inference in Transformers for now. - - self.sequence_length = sequence_length - self.speaker_embedding_dim = normalized_config.speaker_embedding_dim - self.num_mel_bins = normalized_config.num_mel_bins - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - if input_name == "output_sequence": - shape = [self.batch_size, self.sequence_length, self.num_mel_bins] - elif input_name == "speaker_embeddings": - shape = [self.batch_size, self.speaker_embedding_dim] - elif input_name == "spectrogram": - shape = [20, self.num_mel_bins] # NOTE: the first axis length is arbitrary and dynamic - else: - raise ValueError(f"Unsupported input {input_name} for DummySpeechT5InputGenerator") - - return self.random_float_tensor( - shape=shape, - min_value=0, - max_value=1, - framework=framework, - dtype=float_dtype, - ) - - class SpeechT5OnnxConfig(OnnxSeq2SeqConfigWithPast): # TODO: Transformers batched generation for Speecht5 is BROKEN (https://github.com/huggingface/transformers/pull/25943), # so we won't support for now. diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 03a6c0bdec3..553fa50d4c3 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -55,6 +55,7 @@ DummyPointsGenerator, DummySeq2SeqDecoderTextInputGenerator, DummySeq2SeqPastKeyValuesGenerator, + DummySpeechT5InputGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, DummyVisionEmbeddingsGenerator, diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index e33e86f2ab7..5a9e1f62838 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -946,3 +946,39 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int ) for _ in range(self.num_layers) ] + + +class DummySpeechT5InputGenerator(DummyInputGenerator): + SUPPORTED_INPUT_NAMES = ("output_sequence", "speaker_embeddings", "spectrogram") + + def __init__( + self, + task: str, + normalized_config: NormalizedConfig, + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + **kwargs, + ): + self.task = task + self.batch_size = 1 # TODO: SpeechT5 does not support batch inference in Transformers for now. 
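+        # (Batched generation for SpeechT5 is broken upstream, see https://github.com/huggingface/transformers/pull/25943,
+        # which is why both this generator and SpeechT5OnnxConfig pin the batch size to 1.)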
+ + self.sequence_length = sequence_length + self.speaker_embedding_dim = normalized_config.speaker_embedding_dim + self.num_mel_bins = normalized_config.num_mel_bins + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "output_sequence": + shape = [self.batch_size, self.sequence_length, self.num_mel_bins] + elif input_name == "speaker_embeddings": + shape = [self.batch_size, self.speaker_embedding_dim] + elif input_name == "spectrogram": + shape = [20, self.num_mel_bins] # NOTE: the first axis length is arbitrary and dynamic + else: + raise ValueError(f"Unsupported input {input_name} for DummySpeechT5InputGenerator") + + return self.random_float_tensor( + shape=shape, + min_value=0, + max_value=1, + framework=framework, + dtype=float_dtype, + ) From bce548a4d80a17c8a265f851342055099b3234bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Marty?= <9808326+fxmarty@users.noreply.github.com> Date: Wed, 18 Oct 2023 11:01:28 +0200 Subject: [PATCH 16/16] fix tests --- tests/onnxruntime/test_modeling.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 7794bd7b2d4..6bcbf111e9c 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -1090,7 +1090,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForQuestionAnswering.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("custom or unsupported architecture", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -1252,7 +1252,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForMaskedLM.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -1409,7 +1409,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForSequenceClassification.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("that is a custom or unsupported", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -1582,7 +1582,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForTokenClassification.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -1994,7 +1994,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForCausalLM.from_pretrained(MODEL_NAMES["vit"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", 
str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_merge_from_onnx_and_save(self, model_arch): @@ -2400,7 +2400,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForImageClassification.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -2540,7 +2540,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForSemanticSegmentation.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -2695,7 +2695,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForAudioClassification.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -2847,7 +2847,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForCTC.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -2906,7 +2906,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForAudioXVector.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -2998,7 +2998,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForAudioFrameClassification.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -3087,7 +3087,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForSeq2SeqLM.from_pretrained(MODEL_NAMES["bert"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_generate_utils(self, test_name: str, model_arch: str, use_cache: str): @@ -3697,7 +3697,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = 
ORTModelForSpeechSeq2Seq.from_pretrained(MODEL_NAMES["bert"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_generate_utils(self, test_name: str, model_arch: str, use_cache: str): @@ -4066,7 +4066,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForVision2Seq.from_pretrained(MODEL_NAMES["bert"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand( grid_parameters( @@ -4480,7 +4480,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForPix2Struct.from_pretrained(MODEL_NAMES["bert"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_merge_from_transformers_and_save(self, model_arch):
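
To close the series, a minimal end-to-end sketch of the export these patches enable. It is an illustration rather than part of the diff: the checkpoint names and the main_export keyword arguments are assumptions extrapolated from the tests above (which pass {"vocoder": "fxmarty/speecht5-hifigan-tiny"} through model_kwargs) and may differ between Optimum versions.

    from optimum.exporters.onnx import main_export

    # SpeechT5 is exported as separate subcomponents (encoder, decoder without/with
    # past, and postnet + vocoder), so a vocoder checkpoint must be supplied
    # through model_kwargs.
    main_export(
        "microsoft/speecht5_tts",
        output="speecht5_onnx",
        task="text-to-audio",
        model_kwargs={"vocoder": "microsoft/speecht5_hifigan"},
    )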