diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml index 660f417019b..c429b706bff 100644 --- a/.github/workflows/check_code_quality.yml +++ b/.github/workflows/check_code_quality.yml @@ -1,19 +1,11 @@ -name: check_code_quality +name: Code Quality on: push: - branches: [ main ] - paths: - - "optimum/**.py" - - "tests/**.py" - - "examples/**.py" + branches: [main] pull_request: - branches: [ main ] - paths: - - "optimum/**.py" - - "tests/**.py" - - "examples/**.py" + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -29,25 +21,23 @@ jobs: runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Create and start a virtual environment - run: | - python -m venv venv - source venv/bin/activate - - name: Install dependencies - run: | - source venv/bin/activate - pip install --upgrade pip - pip install .[quality] - - name: Check style with black - run: | - source venv/bin/activate - black --check . - - name: Check style with ruff - run: | - source venv/bin/activate - ruff . + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install .[quality] + + - name: Check style with black + run: | + black --check . + + - name: Check style with ruff + run: | + ruff . diff --git a/.github/workflows/test_gptq.yml b/.github/workflows/test_gptq.yml index 0f3c31c6d2c..7e7d3959a6b 100644 --- a/.github/workflows/test_gptq.yml +++ b/.github/workflows/test_gptq.yml @@ -1,29 +1,46 @@ -name: GPTQ Quantization / Test GPU +name: GPTQ / Python - Test on: workflow_dispatch: - schedule: - - cron: 0 1 */3 * * # at 1am every 3 days + push: + branches: [main] + paths: + - tests/gptq/** + - optimum/gptq/** + - .github/workflows/test_gptq.yml pull_request: - types: [opened, synchronize, reopened, labeled] - # uncomment to enable on PR merge on main branch: - #push: - # branches: - # - main + branches: [main] + paths: + - tests/gptq/** + - optimum/gptq/** + - .github/workflows/test_gptq.yml + schedule: + # every day at midnight + - cron: "0 0 * * *" jobs: - do-the-job: - if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'gpu-test') }} - name: Start self-hosted EC2 runner + test_gptq: runs-on: [single-gpu, nvidia-gpu, t4, ci] - env: - AWS_REGION: us-east-1 + steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Build image - run: | - docker build -f tests/gptq/Dockerfile_quantization_gpu -t gptq-gpu . 
- - name: Test with unittest within docker container - run: | - docker run --rm --gpus all -v $(pwd)/hf_cache:/root/.cache/huggingface --workdir=/workspace/optimum/tests gptq-gpu:latest + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run tests + uses: addnab/docker-run-action@v3 + with: + image: pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime + # latest auto-gptq was built with pytorch 2.2 and cuda 12.1 + options: | + --rm + --gpus all + --shm-size 16G + --env RUN_SLOW=1 + --env HF_HOME=/mnt/cache/ + --volume /mnt/cache/:/mnt/cache/ + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install auto-gptq + pip install -e .[tests] + pytest tests/gptq -s -vvvv --durations=0 diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 5a21f12d015..9aa8b307235 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -27,7 +27,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - pip install .[tests,onnxruntime,exporters-tf] + pip install .[tests,exporters] - name: Test with unittest working-directory: tests run: | diff --git a/README.md b/README.md index c892a142994..9a6403cdacb 100644 --- a/README.md +++ b/README.md @@ -79,8 +79,7 @@ It is possible to export 🤗 Transformers and Diffusers models to the OpenVINO optimum-cli export openvino --model distilbert-base-uncased-finetuned-sst-2-english distilbert_sst2_ov ``` -If you add `--weight-format int8`, the weights will be quantized to `int8`, check out our [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#weight-only-quantization) for more detail on weight only quantization. To apply quantization on both weights and activations, you can find more information [here](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#static-quantization). - +If you add `--weight-format int8`, the weights will be quantized to `int8`, check out our [documentation](https://huggingface.co/docs/optimum/main/intel/openvino/export) for more detail. To apply quantization on both weights and activations, you can find more information [here](https://huggingface.co/docs/optimum/main/intel/openvino/optimization#static-quantization). To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. To load a PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, you can set `export=True` when loading your model. @@ -92,13 +91,13 @@ To load a model and run inference with OpenVINO Runtime, you can just replace yo model_id = "distilbert-base-uncased-finetuned-sst-2-english" tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForSequenceClassification.from_pretrained(model_id) -+ model = OVModelForSequenceClassification.from_pretrained("distilbert_sst2_ov") ++ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) classifier = pipeline("text-classification", model=model, tokenizer=tokenizer) results = classifier("He's a dreadful magician.") ``` -You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino). +You can find more examples in the [documentation](https://huggingface.co/docs/optimum/main/intel/openvino/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino). 
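As a minimal sketch of the README flow above (assuming the `distilbert_sst2_ov` directory produced by the `optimum-cli export openvino` command and an `optimum-intel` install), the exported OpenVINO model can also be loaded directly from disk instead of converting on the fly with `export=True`:

```python
from transformers import AutoTokenizer, pipeline
from optimum.intel import OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load the OpenVINO IR previously exported with `optimum-cli export openvino`
model = OVModelForSequenceClassification.from_pretrained("distilbert_sst2_ov")
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("He's a dreadful magician."))
```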
### Neural Compressor diff --git a/docs/source/exporters/onnx/usage_guides/export_a_model.mdx b/docs/source/exporters/onnx/usage_guides/export_a_model.mdx index 4d227e48c23..84c670579c0 100644 --- a/docs/source/exporters/onnx/usage_guides/export_a_model.mdx +++ b/docs/source/exporters/onnx/usage_guides/export_a_model.mdx @@ -87,7 +87,7 @@ Required arguments: output Path indicating the directory where to store generated ONNX model. Optional arguments: - --task TASK The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among: ['default', 'fill-mask', 'text-generation', 'text2text-generation', 'text-classification', 'token-classification', 'multiple-choice', 'object-detection', 'question-answering', 'image-classification', 'image-segmentation', 'masked-im', 'semantic-segmentation', 'automatic-speech-recognition', 'audio-classification', 'audio-frame-classification', 'automatic-speech-recognition', 'audio-xvector', 'image-to-text', 'stable-diffusion', 'zero-shot-object-detection']. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder. + --task TASK The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among: ['default', 'fill-mask', 'text-generation', 'text2text-generation', 'text-classification', 'token-classification', 'multiple-choice', 'object-detection', 'question-answering', 'image-classification', 'image-segmentation', 'masked-im', 'semantic-segmentation', 'automatic-speech-recognition', 'audio-classification', 'audio-frame-classification', 'automatic-speech-recognition', 'audio-xvector', 'image-to-text', 'zero-shot-object-detection', 'image-to-image', 'inpainting', 'text-to-image']. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder. --monolith Force to export the model as a single ONNX file. By default, the ONNX exporter may break the model in several ONNX files, for example for encoder-decoder models where the encoder should be run only once while the decoder is looped over. --device DEVICE The device to use to do the export. Defaults to "cpu". --opset OPSET If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used. diff --git a/docs/source/exporters/tflite/usage_guides/export_a_model.mdx b/docs/source/exporters/tflite/usage_guides/export_a_model.mdx index 8666f445432..ff06af8fb3c 100644 --- a/docs/source/exporters/tflite/usage_guides/export_a_model.mdx +++ b/docs/source/exporters/tflite/usage_guides/export_a_model.mdx @@ -59,7 +59,7 @@ Optional arguments: the model, but are among: ['default', 'fill-mask', 'text-generation', 'text2text-generation', 'text-classification', 'token-classification', 'multiple-choice', 'object-detection', 'question-answering', 'image-classification', 'image-segmentation', 'masked-im', 'semantic- segmentation', 'automatic-speech-recognition', 'audio-classification', 'audio-frame-classification', 'automatic-speech-recognition', 'audio-xvector', 'vision2seq- - lm', 'stable-diffusion', 'zero-shot-object-detection']. For decoder models, use `xxx-with-past` to export the model using past key + lm', 'zero-shot-object-detection', 'text-to-image', 'image-to-image', 'inpainting']. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder. 
--atol ATOL If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used. --pad_token_id PAD_TOKEN_ID diff --git a/examples/onnxruntime/optimization/multiple-choice/run_swag.py b/examples/onnxruntime/optimization/multiple-choice/run_swag.py index 3c43846b9a5..bcddc975907 100644 --- a/examples/onnxruntime/optimization/multiple-choice/run_swag.py +++ b/examples/onnxruntime/optimization/multiple-choice/run_swag.py @@ -37,7 +37,7 @@ from optimum.onnxruntime import ORTModelForMultipleChoice, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. The version of transformers must be >= 4.19.0 @@ -236,7 +236,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path) @@ -254,13 +253,18 @@ def main(): optimizer = ORTOptimizer.from_pretrained(model) # Optimize the model - optimizer.optimize( + optimized_model_path = optimizer.optimize( optimization_config=optimization_config, save_dir=training_args.output_dir, use_external_data_format=onnx_export_args.use_external_data_format, one_external_file=onnx_export_args.one_external_file, ) + model = ORTModelForMultipleChoice.from_pretrained( + optimized_model_path, + provider=optim_args.execution_provider, + ) + if training_args.do_eval: # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the # prediction step(s) @@ -339,13 +343,12 @@ def compute_metrics(eval_predictions): # Evaluation logger.info("*** Evaluate ***") - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["label"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: diff --git a/examples/onnxruntime/optimization/question-answering/run_qa.py b/examples/onnxruntime/optimization/question-answering/run_qa.py index 04a9bd34f36..407714cb01f 100644 --- a/examples/onnxruntime/optimization/question-answering/run_qa.py +++ b/examples/onnxruntime/optimization/question-answering/run_qa.py @@ -37,7 +37,7 @@ from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
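The optimization example scripts in this diff now capture the path returned by `ORTOptimizer.optimize` and reload it as an `ORTModelForXxx` before evaluation. A minimal, self-contained sketch of that flow (the checkpoint name, output directory and execution provider here are illustrative placeholders, not values from the scripts):

```python
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# Export a PyTorch checkpoint to ONNX on the fly.
model = ORTModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", export=True
)

# Apply graph optimization; `optimize` returns the directory holding the optimized model.
optimizer = ORTOptimizer.from_pretrained(model)
optimized_model_path = optimizer.optimize(
    optimization_config=OptimizationConfig(optimization_level=1),
    save_dir="onnx_optimized",
)

# Reload the optimized model with the desired execution provider, as the updated scripts do.
model = ORTModelForSequenceClassification.from_pretrained(
    optimized_model_path, provider="CPUExecutionProvider"
)
```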
@@ -305,7 +305,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path) @@ -323,13 +322,15 @@ def main(): optimizer = ORTOptimizer.from_pretrained(model) # Optimize the model - optimizer.optimize( + optimized_model_path = optimizer.optimize( optimization_config=optimization_config, save_dir=training_args.output_dir, use_external_data_format=onnx_export_args.use_external_data_format, one_external_file=onnx_export_args.one_external_file, ) + model = ORTModelForQuestionAnswering.from_pretrained(optimized_model_path, provider=optim_args.execution_provider) + # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the # prediction step(s) if training_args.do_eval or training_args.do_predict: @@ -478,13 +479,12 @@ def compute_metrics(p: EvalPrediction): # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["start_positions", "end_positions"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) predictions = post_processing_function(eval_examples, eval_dataset, outputs.predictions) metrics = compute_metrics(predictions) @@ -514,12 +514,12 @@ def compute_metrics(p: EvalPrediction): # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["start_positions", "end_positions"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = post_processing_function(predict_examples, predict_dataset, outputs.predictions) metrics = compute_metrics(predictions) diff --git a/examples/onnxruntime/optimization/text-classification/README.md b/examples/onnxruntime/optimization/text-classification/README.md index 42a99cc73d3..3a7dce2b59f 100644 --- a/examples/onnxruntime/optimization/text-classification/README.md +++ b/examples/onnxruntime/optimization/text-classification/README.md @@ -14,13 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. --> -# Text classification +# Text classification ## GLUE tasks -The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py) -allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as -the ones from the [GLUE benchmark](https://gluebenchmark.com/). 
+The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py) allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as the ones from the [GLUE benchmark](https://gluebenchmark.com/). The following example applies graph optimization on a DistilBERT fine-tuned on the sst-2 task. Here the optimization level is selected to be 1, enabling basic optimizations such as redundant node eliminations and constant folding. Higher optimization level will result in hardware dependent optimized graph. diff --git a/examples/onnxruntime/optimization/text-classification/run_glue.py b/examples/onnxruntime/optimization/text-classification/run_glue.py index a07193915b8..222dda15074 100644 --- a/examples/onnxruntime/optimization/text-classification/run_glue.py +++ b/examples/onnxruntime/optimization/text-classification/run_glue.py @@ -42,7 +42,7 @@ from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -250,7 +250,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) @@ -268,13 +267,17 @@ def main(): optimizer = ORTOptimizer.from_pretrained(model) # Optimize the model - optimizer.optimize( + optimized_model_path = optimizer.optimize( optimization_config=optimization_config, save_dir=training_args.output_dir, use_external_data_format=onnx_export_args.use_external_data_format, one_external_file=onnx_export_args.one_external_file, ) + model = ORTModelForSequenceClassification.from_pretrained( + optimized_model_path, provider=optim_args.execution_provider + ) + # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the # prediction step(s) if training_args.do_eval or training_args.do_predict: @@ -408,13 +411,13 @@ def compute_metrics(p: EvalPrediction): desc="Running tokenizer on the evaluation dataset", ) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + eval_dataset=eval_dataset, compute_metrics=compute_metrics, label_names=["label"], ) - outputs = ort_model.evaluation_loop(eval_dataset) + # Save metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: json.dump(outputs.metrics, f, indent=4, sort_keys=True) @@ -436,10 +439,12 @@ def compute_metrics(p: EvalPrediction): desc="Running tokenizer on the test dataset", ) - ort_model = ORTModel( - optimized_model_path, execution_provider=optim_args.execution_provider, label_names=["label"] + outputs = evaluation_loop( + model=model, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, + label_names=["label"], ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.squeeze(outputs.predictions) if is_regression else np.argmax(outputs.predictions, axis=1) # Save predictions diff --git a/examples/onnxruntime/optimization/token-classification/run_ner.py b/examples/onnxruntime/optimization/token-classification/run_ner.py index 
73db3671d2f..2e7b63792c3 100644 --- a/examples/onnxruntime/optimization/token-classification/run_ner.py +++ b/examples/onnxruntime/optimization/token-classification/run_ner.py @@ -38,7 +38,7 @@ from optimum.onnxruntime import ORTModelForTokenClassification, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -276,7 +276,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path) @@ -480,12 +479,11 @@ def compute_metrics(p): desc="Running tokenizer on the validation dataset", ) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: @@ -509,12 +507,11 @@ def compute_metrics(p): desc="Running tokenizer on the prediction dataset", ) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.argmax(outputs.predictions, axis=2) # Remove ignored index (special tokens) diff --git a/examples/onnxruntime/quantization/image-classification/run_image_classification.py b/examples/onnxruntime/quantization/image-classification/run_image_classification.py index 3d0fa72882e..6feaaef4f3b 100644 --- a/examples/onnxruntime/quantization/image-classification/run_image_classification.py +++ b/examples/onnxruntime/quantization/image-classification/run_image_classification.py @@ -22,7 +22,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -38,7 +37,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForImageClassification from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -47,6 +45,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop logger = logging.getLogger(__name__) @@ -378,13 +377,16 @@ def compute_metrics(p: EvalPrediction): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForImageClassification.from_pretrained( + quantized_model_path, provider=optim_args.execution_provider + ) # Evaluation if training_args.do_eval: @@ -409,13 +411,12 @@ def compute_metrics(p: EvalPrediction): # Set the validation 
transforms eval_dataset = eval_dataset.with_transform(preprocess_function) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=[labels_column], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: json.dump(outputs.metrics, f, indent=4, sort_keys=True) diff --git a/examples/onnxruntime/quantization/multiple-choice/run_swag.py b/examples/onnxruntime/quantization/multiple-choice/run_swag.py index 9d9642c12d5..9a8423f836d 100644 --- a/examples/onnxruntime/quantization/multiple-choice/run_swag.py +++ b/examples/onnxruntime/quantization/multiple-choice/run_swag.py @@ -38,7 +38,6 @@ from optimum.onnxruntime import ORTModelForMultipleChoice, ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( ExcludeGeLUNodes, @@ -46,6 +45,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. The version of transformers must be >= 4.19.0 @@ -409,13 +409,14 @@ def compute_metrics(eval_predictions): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForMultipleChoice.from_pretrained(quantized_model_path, provider=optim_args.execution_provider) # Evaluation if training_args.do_eval: @@ -436,13 +437,12 @@ def compute_metrics(eval_predictions): load_from_cache_file=not data_args.overwrite_cache, ) - ort_model = ORTModel( - os.path.join(training_args.output_dir, "model_quantized.onnx"), - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["label"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: diff --git a/examples/onnxruntime/quantization/question-answering/README.md b/examples/onnxruntime/quantization/question-answering/README.md index 380afff8cad..8345ca8e4d0 100644 --- a/examples/onnxruntime/quantization/question-answering/README.md +++ b/examples/onnxruntime/quantization/question-answering/README.md @@ -16,13 +16,9 @@ limitations under the License. # Question answering +The script [`run_qa.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/question-answering/run_qa.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for question answering tasks. 
-The script [`run_qa.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/question-answering/run_qa.py) -allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph -optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for question answering tasks. - -Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along -the flag `--version_2_with_negative`. +Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along the flag `--version_2_with_negative`. The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the SQuAD1.0 dataset. diff --git a/examples/onnxruntime/quantization/question-answering/run_qa.py b/examples/onnxruntime/quantization/question-answering/run_qa.py index 4a6a854fd97..50661b7b420 100644 --- a/examples/onnxruntime/quantization/question-answering/run_qa.py +++ b/examples/onnxruntime/quantization/question-answering/run_qa.py @@ -24,7 +24,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -39,7 +38,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForQuestionAnswering from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -48,6 +46,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
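The quantization scripts are refactored the same way: they keep the path returned by `ORTQuantizer.quantize` and reload it as an `ORTModelForXxx`. A minimal sketch of the dynamic-quantization variant (checkpoint, output directory and provider are illustrative; the scripts above build their own `QuantizationConfig`, calibration ranges and preprocessor):

```python
from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# Export the model to ONNX, then quantize it dynamically (no calibration data needed).
model = ORTModelForQuestionAnswering.from_pretrained(
    "distilbert-base-cased-distilled-squad", export=True
)
quantizer = ORTQuantizer.from_pretrained(model)
qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)
quantized_model_path = quantizer.quantize(save_dir="onnx_quantized", quantization_config=qconfig)

# Reload the quantized model with the chosen execution provider, mirroring the updated scripts.
model = ORTModelForQuestionAnswering.from_pretrained(
    quantized_model_path, provider="CPUExecutionProvider"
)
```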
@@ -651,25 +650,25 @@ def compute_metrics(p: EvalPrediction): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForQuestionAnswering.from_pretrained(quantized_model_path, provider=optim_args.execution_provider) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, label_names=["start_positions", "end_positions"], ) - outputs = ort_model.evaluation_loop(eval_dataset) predictions = post_processing_function(eval_examples, eval_dataset, outputs.predictions) metrics = compute_metrics(predictions) @@ -681,12 +680,11 @@ def compute_metrics(p: EvalPrediction): if training_args.do_predict: logger.info("*** Predict ***") - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, label_names=["start_positions", "end_positions"], ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = post_processing_function(predict_examples, predict_dataset, outputs.predictions) metrics = compute_metrics(predictions) diff --git a/examples/onnxruntime/quantization/text-classification/README.md b/examples/onnxruntime/quantization/text-classification/README.md index 460bb56fba8..95fd3335171 100644 --- a/examples/onnxruntime/quantization/text-classification/README.md +++ b/examples/onnxruntime/quantization/text-classification/README.md @@ -18,10 +18,7 @@ limitations under the License. ## GLUE tasks -The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/text-classification/run_glue.py) -allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph -optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as -the ones from the [GLUE benchmark](https://gluebenchmark.com/). +The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/text-classification/run_glue.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as the ones from the [GLUE benchmark](https://gluebenchmark.com/). The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the sst-2 task. 
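After reloading the quantized model, the rewritten scripts evaluate it with the `evaluation_loop` helper that these diffs import from `optimum.onnxruntime.utils` (replacing the removed `ORTModel.evaluation_loop`). A sketch of that call as used in the hunks below, where `model`, `eval_dataset` and `compute_metrics` are objects built earlier in each script:

```python
from optimum.onnxruntime.utils import evaluation_loop

# `model` is the reloaded ORTModelForSequenceClassification, `eval_dataset` the tokenized
# validation split, and `compute_metrics` the metric function defined by the script.
outputs = evaluation_loop(
    model=model,
    dataset=eval_dataset,
    label_names=["label"],
    compute_metrics=compute_metrics,
)
print(outputs.metrics)
```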
diff --git a/examples/onnxruntime/quantization/text-classification/run_glue.py b/examples/onnxruntime/quantization/text-classification/run_glue.py index bc141b2194f..4b9ee0403c3 100644 --- a/examples/onnxruntime/quantization/text-classification/run_glue.py +++ b/examples/onnxruntime/quantization/text-classification/run_glue.py @@ -23,7 +23,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -44,7 +43,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForSequenceClassification from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -53,6 +51,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -476,13 +475,16 @@ def compute_metrics(p: EvalPrediction): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForSequenceClassification.from_pretrained( + quantized_model_path, provider=optim_args.execution_provider + ) # Evaluation if training_args.do_eval: @@ -504,13 +506,13 @@ def compute_metrics(p: EvalPrediction): f" Evaluation results may suffer from a wrong matching." ) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, label_names=["label"], ) - outputs = ort_model.evaluation_loop(eval_dataset) + # Save metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: json.dump(outputs.metrics, f, indent=4, sort_keys=True) @@ -525,12 +527,11 @@ def compute_metrics(p: EvalPrediction): if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, label_names=["label"], ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.squeeze(outputs.predictions) if is_regression else np.argmax(outputs.predictions, axis=1) # Save predictions diff --git a/examples/onnxruntime/quantization/token-classification/README.md b/examples/onnxruntime/quantization/token-classification/README.md index f56388ed3c0..540b3cbe2dd 100644 --- a/examples/onnxruntime/quantization/token-classification/README.md +++ b/examples/onnxruntime/quantization/token-classification/README.md @@ -16,10 +16,7 @@ limitations under the License. 
# Token classification - -The script [`run_ner.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/token-classification/run_ner.py) -allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph -optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for token classification tasks. +The script [`run_ner.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/token-classification/run_ner.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for token classification tasks. The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the CoNLL-2003 task diff --git a/examples/onnxruntime/quantization/token-classification/run_ner.py b/examples/onnxruntime/quantization/token-classification/run_ner.py index 1cc12d3fbc0..3a5798c57a8 100644 --- a/examples/onnxruntime/quantization/token-classification/run_ner.py +++ b/examples/onnxruntime/quantization/token-classification/run_ner.py @@ -25,7 +25,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -40,7 +39,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForTokenClassification from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -49,6 +47,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
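The token-classification script keeps the same quantization preprocessor as the other examples: a set of passes that keep LayerNorm, GeLU and the residual/Softmax `Add` patterns out of quantization. A sketch of how that preprocessor is assembled, mirroring the imports above and the `register_pass(ExcludeNodeFollowedBy("Add", "Softmax"))` call visible in the next hunk:

```python
from optimum.onnxruntime.preprocessors import QuantizationPreprocessor
from optimum.onnxruntime.preprocessors.passes import (
    ExcludeGeLUNodes,
    ExcludeLayerNormNodes,
    ExcludeNodeAfter,
    ExcludeNodeFollowedBy,
)

quantization_preprocessor = QuantizationPreprocessor()
# Exclude the nodes constituting LayerNorm and GeLU from quantization.
quantization_preprocessor.register_pass(ExcludeLayerNormNodes())
quantization_preprocessor.register_pass(ExcludeGeLUNodes())
# Exclude the residual-connection Add nodes and the Add nodes followed by a Softmax.
quantization_preprocessor.register_pass(ExcludeNodeAfter("Add", "Add"))
quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax"))
```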
@@ -551,13 +550,16 @@ def compute_metrics(p): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForTokenClassification.from_pretrained( + quantized_model_path, provider=optim_args.execution_provider + ) # Evaluation if training_args.do_eval: @@ -572,12 +574,11 @@ def compute_metrics(p): desc="Running tokenizer on the validation dataset", ) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: @@ -602,12 +603,11 @@ def compute_metrics(p): desc="Running tokenizer on the prediction dataset", ) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.argmax(outputs.predictions, axis=2) # Remove ignored index (special tokens) diff --git a/optimum/exporters/onnx/__init__.py b/optimum/exporters/onnx/__init__.py index 609096e37ef..6b99e484578 100644 --- a/optimum/exporters/onnx/__init__.py +++ b/optimum/exporters/onnx/__init__.py @@ -31,7 +31,7 @@ "utils": [ "get_decoder_models_for_export", "get_encoder_decoder_models_for_export", - "get_stable_diffusion_models_for_export", + "get_diffusion_models_for_export", "MODEL_TYPES_REQUIRING_POSITION_IDS", ], "__main__": ["main_export"], @@ -50,7 +50,7 @@ from .utils import ( get_decoder_models_for_export, get_encoder_decoder_models_for_export, - get_stable_diffusion_models_for_export, + get_diffusion_models_for_export, MODEL_TYPES_REQUIRING_POSITION_IDS, ) from .__main__ import main_export diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 1e36af06ade..703e98df3e2 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -221,13 +221,24 @@ def main_export( " and passing it is not required anymore." ) + if task in ["stable-diffusion", "stable-diffusion-xl"]: + logger.warning( + f"The task `{task}` is deprecated and will be removed in a future release of Optimum. " + "Please use one of the following tasks instead: `text-to-image`, `image-to-image`, `inpainting`." 
+ ) + original_task = task task = TasksManager.map_from_synonym(task) - framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) - library_name = TasksManager.infer_library_from_model( - model_name_or_path, subfolder=subfolder, library_name=library_name - ) + if framework is None: + framework = TasksManager.determine_framework( + model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) + + if library_name is None: + library_name = TasksManager.infer_library_from_model( + model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) torch_dtype = None if framework == "pt": @@ -321,9 +332,7 @@ def main_export( ) model.config.pad_token_id = pad_token_id - if "stable-diffusion" in task: - model_type = "stable-diffusion" - elif hasattr(model.config, "export_model_type"): + if hasattr(model.config, "export_model_type"): model_type = model.config.export_model_type.replace("_", "-") else: model_type = model.config.model_type.replace("_", "-") diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 4d5a2afc374..63a9067b90c 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -60,7 +60,7 @@ from transformers.modeling_utils import PreTrainedModel if is_diffusers_available(): - from diffusers import ModelMixin + from diffusers import DiffusionPipeline, ModelMixin if is_tf_available(): from transformers.modeling_tf_utils import TFPreTrainedModel @@ -264,7 +264,7 @@ def _run_validation( atol = config.ATOL_FOR_VALIDATION if "diffusers" in str(reference_model.__class__) and not is_diffusers_available(): - raise ImportError("The pip package `diffusers` is required to validate stable diffusion ONNX models.") + raise ImportError("The pip package `diffusers` is required to validate diffusion ONNX models.") framework = "pt" if is_torch_available() and isinstance(reference_model, nn.Module) else "tf" @@ -388,7 +388,7 @@ def _run_validation( logger.info(f"\t-[✓] ONNX model output names match reference model ({onnx_output_names})") if "diffusers" in str(reference_model.__class__) and not is_diffusers_available(): - raise ImportError("The pip package `diffusers` is required to validate stable diffusion ONNX models.") + raise ImportError("The pip package `diffusers` is required to validate diffusion ONNX models.") # Check the shape and values match shape_failures = [] @@ -854,7 +854,7 @@ def export( opset = config.DEFAULT_ONNX_OPSET if "diffusers" in str(model.__class__) and not is_diffusers_available(): - raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.") + raise ImportError("The pip package `diffusers` is required to export diffusion models to ONNX.") if not config.is_transformers_support_available: import transformers @@ -912,7 +912,7 @@ def export( def onnx_export_from_model( - model: Union["PreTrainedModel", "TFPreTrainedModel"], + model: Union["PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline"], output: Union[str, Path], opset: Optional[int] = None, optimize: Optional[str] = None, @@ -999,15 +999,16 @@ def onnx_export_from_model( >>> onnx_export_from_model(model, output="gpt2_onnx/") ``` """ - library_name = TasksManager._infer_library_from_model(model) - TasksManager.standardize_model_attributes(model, library_name) + TasksManager.standardize_model_attributes(model) if hasattr(model.config, "export_model_type"): model_type = 
model.config.export_model_type.replace("_", "-") else: model_type = model.config.model_type.replace("_", "-") + library_name = TasksManager.infer_library_from_model(model) + custom_architecture = library_name == "transformers" and model_type not in TasksManager._SUPPORTED_MODEL_TYPE if task is not None: @@ -1191,7 +1192,7 @@ def onnx_export_from_model( optimizer.optimize(save_dir=output, optimization_config=optimization_config, file_suffix="") # Optionally post process the obtained ONNX file(s), for example to merge the decoder / decoder with past if any - # TODO: treating stable diffusion separately is quite ugly + # TODO: treating diffusion separately is quite ugly if not no_post_process and library_name != "diffusers": try: logger.info("Post-processing the exported models...") diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index a1e9220ac05..fe3aef510e9 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -72,6 +72,7 @@ from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME from .model_patcher import ( FalconModelPatcher, + MistralModelPatcher, MusicgenModelPatcher, SAMModelPatcher, SentenceTransformersCLIPPatcher, @@ -241,7 +242,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class GPT2OnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_layers="n_layer", num_attention_heads="n_head") @@ -263,7 +264,7 @@ class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -316,6 +317,11 @@ class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MistralModelPatcher(self, model, model_kwargs=model_kwargs) + class MPTOnnxConfig(TextDecoderOnnxConfig): # MPT does not require position_ids input. @@ -604,7 +610,7 @@ def inputs_for_default_and_seq2seq_lm(self): def inputs_for_causal_lm(self): if self.use_past_in_inputs: common_inputs = { - "input_ids": {0: "batch_size"}, + "input_ids": {0: "batch_size", 1: "sequence_length"}, "attention_mask": {0: "batch_size", 1: "past_sequence_length + 1"}, } for i in range(self._normalized_config.decoder_num_layers): @@ -649,7 +655,11 @@ def outputs(self) -> Dict[str, Dict[int, str]]: common_outputs = super(OnnxConfigWithPast, self).outputs if self.use_past: # When exporting decoder models with use_cache=True, both the decoder without past and with past have the KV cache as an output. 
- for i in range(self._normalized_config.encoder_num_layers): + for i in range( + self._normalized_config.encoder_num_layers + if self.task != "text-generation" + else self._normalized_config.decoder_num_layers + ): common_outputs[f"present.{i}.key"] = {0: "batch_size", 2: "past_sequence_length + sequence_length"} common_outputs[f"present.{i}.value"] = { 0: "batch_size", @@ -890,6 +900,22 @@ class CLIPNormalizedConfig(NormalizedTextAndVisionConfig): VISION_CONFIG = "vision_config" +class CLIPVisionModelOnnxConfig(VisionOnnxConfig): + NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return {"pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}} + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + common_outputs = super().outputs + common_outputs["last_hidden_state"] = {0: "batch_size"} + common_outputs["pooler_output"] = {0: "batch_size"} + + return common_outputs + + class CLIPOnnxConfig(TextAndVisionOnnxConfig): NORMALIZED_CONFIG_CLASS = CLIPNormalizedConfig DEFAULT_ONNX_OPSET = 14 diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 0a105343546..5e720d0cd7d 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -42,6 +42,9 @@ _prepare_4d_causal_attention_mask_for_sdpa = None AttentionMaskConverter = None +if _transformers_version >= version.parse("4.42"): + from transformers.cache_utils import SlidingWindowCache, StaticCache + if TYPE_CHECKING: from transformers import PreTrainedModel, TFPreTrainedModel @@ -746,6 +749,20 @@ def patched_forward( class SentenceTransformersTransformerPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + if _transformers_version >= version.parse("4.42") and self.real_config._config.model_type == "mistral": + self._model[0].auto_model._update_causal_mask = types.MethodType( + _update_causal_mask_patched, self._model[0].auto_model + ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if _transformers_version >= version.parse("4.42") and self.real_config._config.model_type == "mistral": + self._model[0].auto_model._update_causal_mask = types.MethodType( + self._update_causal_mask_original, self._model[0].auto_model + ) + def __init__( self, config: "OnnxConfig", @@ -754,6 +771,9 @@ def __init__( ): super().__init__(config, model, model_kwargs) + if _transformers_version >= version.parse("4.42") and self.real_config._config.model_type == "mistral": + self._update_causal_mask_original = self._model[0].auto_model._update_causal_mask + def patched_forward(input_ids, attention_mask): result = self.orig_forward({"input_ids": input_ids, "attention_mask": attention_mask}) @@ -931,3 +951,182 @@ def patched_forward( return {"audio_values": audio_values} self.patched_forward = patched_forward + + +def _update_causal_mask_patched( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values, + use_cache: bool, + output_attentions: bool, +): + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. 
A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self._attn_implementation == "flash_attention_2": + if attention_mask is not None and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0] + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + + # cache_position must be valid here no matter which cache we use + past_seen_tokens = cache_position[0] if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) + + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + # SlidingWindowCache + if using_sliding_window_cache: + target_length = max(sequence_length, self.config.sliding_window) + # StaticCache + elif using_static_cache: + target_length = past_key_values.get_max_length() + # DynamicCache or no cache + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + exclude_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if self.config.sliding_window is not None: + if not using_sliding_window_cache or sequence_length > self.config.sliding_window: + # ---------------- NOTE: This part is patched ----------------------------- + exclude_mask = torch.bitwise_or( + exclude_mask, + torch.arange(target_length, device=device) + <= (cache_position.reshape(-1, 1) - self.config.sliding_window), + ) + # ---------------- NOTE: patch end ---------------------------------------- + + causal_mask *= exclude_mask + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, 
:mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + # if ( + # self.config._attn_implementation == "sdpa" + # and attention_mask is not None + # and attention_mask.device.type == "cuda" + # and not output_attentions + # ): + # # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # # Details: https://github.com/pytorch/pytorch/issues/110213 + # causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +class MistralModelPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + if AttentionMaskConverter is not None: + # TODO: Remove this _make_causal_mask patch if once transformers if much above 4.35 + AttentionMaskConverter._make_causal_mask = _make_causal_mask_patched_staticmethod + + if _transformers_version >= version.parse("4.36"): + AttentionMaskConverter._unmask_unattended = _unmask_unattended_patched_staticmethod + + if _transformers_version >= version.parse("4.36"): + patch_everywhere( + "_prepare_4d_causal_attention_mask_for_sdpa", _prepare_4d_causal_attention_mask_for_sdpa_patched + ) + + if _transformers_version >= version.parse("4.42"): + if hasattr(self._model, "model"): + self._model.model._update_causal_mask = types.MethodType( + _update_causal_mask_patched, self._model.model + ) + else: + self._model._update_causal_mask = types.MethodType(_update_causal_mask_patched, self._model) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if AttentionMaskConverter is not None: + # TODO: Remove this _make_causal_mask patch if once transformers if much above 4.35 + AttentionMaskConverter._make_causal_mask = staticmethod(self.original_make_causal) + + if _transformers_version >= version.parse("4.36"): + AttentionMaskConverter._unmask_unattended = staticmethod(self.original_unmask_unattended) + + if _transformers_version >= version.parse("4.36"): + patch_everywhere( + "_prepare_4d_causal_attention_mask_for_sdpa", self.original_prepare_4d_causal_attention_mask_for_sdpa + ) + + if _transformers_version >= version.parse("4.42"): + if hasattr(self._model, "model"): + self._model.model._update_causal_mask = types.MethodType( + self._update_causal_mask_original, self._model.model + ) + else: + self._model._update_causal_mask = types.MethodType(self._update_causal_mask_original, self._model) + + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + if _transformers_version >= version.parse("4.36"): + self.original_prepare_4d_causal_attention_mask_for_sdpa = _prepare_4d_causal_attention_mask_for_sdpa + self.original_unmask_unattended = AttentionMaskConverter._unmask_unattended + + # TODO: Remove this if once transformers if much above 4.35 + if AttentionMaskConverter is not None: + self.original_make_causal = AttentionMaskConverter._make_causal_mask + + if _transformers_version >= version.parse("4.42"): + if hasattr(self._model, "model"): + self._update_causal_mask_original = self._model.model._update_causal_mask + else: + self._update_causal_mask_original = self._model._update_causal_mask diff --git 
a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 8ecba9231f6..675566ba23e 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -34,6 +34,9 @@ from ..utils import ( get_decoder_models_for_export as _get_decoder_models_for_export, ) +from ..utils import ( + get_diffusion_models_for_export as _get_diffusion_models_for_export, +) from ..utils import ( get_encoder_decoder_models_for_export as _get_encoder_decoder_models_for_export, ) @@ -43,9 +46,6 @@ from ..utils import ( get_speecht5_models_for_export as _get_speecht5_models_for_export, ) -from ..utils import ( - get_stable_diffusion_models_for_export as _get_stable_diffusion_models_for_export, -) logger = logging.get_logger() @@ -68,7 +68,7 @@ from transformers.modeling_tf_utils import TFPreTrainedModel if is_diffusers_available(): - from diffusers import ModelMixin, StableDiffusionPipeline + from diffusers import DiffusionPipeline, ModelMixin MODEL_TYPES_REQUIRING_POSITION_IDS = { @@ -219,13 +219,13 @@ def _get_submodels_and_onnx_configs( DEPRECATION_WARNING_GET_MODEL_FOR_EXPORT = "The usage of `optimum.exporters.onnx.utils.get_{model_type}_models_for_export` is deprecated and will be removed in a future release, please use `optimum.exporters.utils.get_{model_type}_models_for_export` instead." -def get_stable_diffusion_models_for_export( - pipeline: "StableDiffusionPipeline", +def get_diffusion_models_for_export( + pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", ) -> Dict[str, Tuple[Union["PreTrainedModel", "ModelMixin"], "ExportConfig"]]: - logger.warning(DEPRECATION_WARNING_GET_MODEL_FOR_EXPORT.format(model_type="stable_diffusion")) - return _get_stable_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter="onnx") + logger.warning(DEPRECATION_WARNING_GET_MODEL_FOR_EXPORT.format(model_type="diffusion")) + return _get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter="onnx") def get_sam_models_for_export(model: Union["PreTrainedModel", "TFPreTrainedModel"], config: "ExportConfig"): diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index c0ec64b044e..4bfe652a93a 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -15,8 +15,6 @@ """Model export tasks manager.""" import importlib -import inspect -import itertools import os import warnings from functools import partial @@ -31,14 +29,12 @@ from transformers import AutoConfig, PretrainedConfig, is_tf_available, is_torch_available from transformers.utils import SAFE_WEIGHTS_NAME, TF2_WEIGHTS_NAME, WEIGHTS_NAME, logging -from ..utils import CONFIG_NAME -from ..utils.import_utils import is_onnx_available +from ..utils.import_utils import is_diffusers_available, is_onnx_available if TYPE_CHECKING: from .base import ExportConfig - logger = logging.get_logger(__name__) # pylint: disable=invalid-name if not is_torch_available() and not is_tf_available(): @@ -54,6 +50,14 @@ if is_tf_available(): from transformers import TFPreTrainedModel +if is_diffusers_available(): + from diffusers import DiffusionPipeline + from diffusers.pipelines.auto_pipeline import ( + AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, + AUTO_INPAINT_PIPELINES_MAPPING, + AUTO_TEXT2IMAGE_PIPELINES_MAPPING, + ) + ExportConfigConstructor = Callable[[PretrainedConfig], "ExportConfig"] TaskNameToExportConfigDict = Dict[str, ExportConfigConstructor] @@ -123,19 +127,45 @@ def supported_tasks_mapping( return mapping -def get_model_loaders_to_tasks(tasks_to_model_loaders: Dict[str, 
Union[str, Tuple[str]]]) -> Dict[str, str]: - """ - Reverses tasks_to_model_loaders while flattening the case where the same task maps to several - auto classes (e.g. automatic-speech-recognition). - """ - model_loaders_to_tasks = {} - for task, model_loaders in tasks_to_model_loaders.items(): +def get_diffusers_tasks_to_model_mapping(): + """task -> model mapping (model type -> model class)""" + + tasks_to_model_mapping = {} + + for task_name, model_mapping in ( + ("text-to-image", AUTO_TEXT2IMAGE_PIPELINES_MAPPING), + ("image-to-image", AUTO_IMAGE2IMAGE_PIPELINES_MAPPING), + ("inpainting", AUTO_INPAINT_PIPELINES_MAPPING), + ): + tasks_to_model_mapping[task_name] = {} + + for model_type, model_class in model_mapping.items(): + tasks_to_model_mapping[task_name][model_type] = model_class.__name__ + + return tasks_to_model_mapping + + +def get_transformers_tasks_to_model_mapping(tasks_to_model_loader, framework="pt"): + """task -> model mapping (model type -> model class)""" + + if framework == "pt": + auto_modeling_module = importlib.import_module("transformers.models.auto.modeling_auto") + elif framework == "tf": + auto_modeling_module = importlib.import_module("transformers.models.auto.modeling_tf_auto") + + tasks_to_model_mapping = {} + for task_name, model_loaders in tasks_to_model_loader.items(): if isinstance(model_loaders, str): - model_loaders_to_tasks[model_loaders] = task - else: - model_loaders_to_tasks.update({model_loader_name: task for model_loader_name in model_loaders}) + model_loaders = (model_loaders,) - return model_loaders_to_tasks + tasks_to_model_mapping[task_name] = {} + for model_loader in model_loaders: + model_loader_class = getattr(auto_modeling_module, model_loader, None) + if model_loader_class is not None: + # we can just update the model_type to model_class mapping since we only need one either way + tasks_to_model_mapping[task_name].update(model_loader_class._model_mapping._model_mapping) + + return tasks_to_model_mapping class TasksManager: @@ -149,10 +179,17 @@ class TasksManager: _TIMM_TASKS_TO_MODEL_LOADERS = {} _LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP = {} + # Torch model mappings + _TRANSFORMERS_TASKS_TO_MODEL_MAPPINGS = {} + _DIFFUSERS_TASKS_TO_MODEL_MAPPINGS = {} + # TF model loaders _TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS = {} _LIBRARY_TO_TF_TASKS_TO_MODEL_LOADER_MAP = {} + # TF model mappings + _TRANSFORMERS_TASKS_TO_MODEL_MAPPINGS = {} + if is_torch_available(): # Refer to https://huggingface.co/datasets/huggingface/transformers-metadata/blob/main/pipeline_tags.json # In case the same task (pipeline tag) may map to several loading classes, we use a tuple and the @@ -166,7 +203,6 @@ class TasksManager: "audio-frame-classification": "AutoModelForAudioFrameClassification", "audio-xvector": "AutoModelForAudioXVector", "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"), - "conversational": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"), "depth-estimation": "AutoModelForDepthEstimation", "feature-extraction": "AutoModel", "fill-mask": "AutoModelForMaskedLM", @@ -189,10 +225,9 @@ class TasksManager: "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", } - _DIFFUSERS_TASKS_TO_MODEL_LOADERS = { - "stable-diffusion": "StableDiffusionPipeline", - "stable-diffusion-xl": "StableDiffusionXLImg2ImgPipeline", - } + _TRANSFORMERS_TASKS_TO_MODEL_MAPPINGS = get_transformers_tasks_to_model_mapping( + _TRANSFORMERS_TASKS_TO_MODEL_LOADERS, framework="pt" + ) _TIMM_TASKS_TO_MODEL_LOADERS = { "image-classification": 
"create_model", @@ -203,6 +238,15 @@ class TasksManager: "sentence-similarity": "SentenceTransformer", } + if is_diffusers_available(): + _DIFFUSERS_TASKS_TO_MODEL_LOADERS = { + "image-to-image": "AutoPipelineForImage2Image", + "inpainting": "AutoPipelineForInpainting", + "text-to-image": "AutoPipelineForText2Image", + } + + _DIFFUSERS_TASKS_TO_MODEL_MAPPINGS = get_diffusers_tasks_to_model_mapping() + _LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP = { "diffusers": _DIFFUSERS_TASKS_TO_MODEL_LOADERS, "sentence_transformers": _SENTENCE_TRANSFORMERS_TASKS_TO_MODEL_LOADERS, @@ -212,7 +256,6 @@ class TasksManager: if is_tf_available(): _TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS = { - "conversational": ("TFAutoModelForCausalLM", "TFAutoModelForSeq2SeqLM"), "document-question-answering": "TFAutoModelForDocumentQuestionAnswering", "feature-extraction": "TFAutoModel", "fill-mask": "TFAutoModelForMaskedLM", @@ -222,15 +265,12 @@ class TasksManager: "text-classification": "TFAutoModelForSequenceClassification", "token-classification": "TFAutoModelForTokenClassification", "multiple-choice": "TFAutoModelForMultipleChoice", - "object-detection": "TFAutoModelForObjectDetection", "question-answering": "TFAutoModelForQuestionAnswering", "image-segmentation": "TFAutoModelForImageSegmentation", "masked-im": "TFAutoModelForMaskedImageModeling", "semantic-segmentation": "TFAutoModelForSemanticSegmentation", "automatic-speech-recognition": "TFAutoModelForSpeechSeq2Seq", "audio-classification": "TFAutoModelForAudioClassification", - "audio-frame-classification": "TFAutoModelForAudioFrameClassification", - "audio-xvector": "TFAutoModelForAudioXVector", "image-to-text": "TFAutoModelForVision2Seq", "zero-shot-image-classification": "TFAutoModelForZeroShotImageClassification", "zero-shot-object-detection": "TFAutoModelForZeroShotObjectDetection", @@ -240,6 +280,10 @@ class TasksManager: "transformers": _TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS, } + _TRANSFORMERS_TASKS_TO_TF_MODEL_MAPPINGS = get_transformers_tasks_to_model_mapping( + _TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS, framework="tf" + ) + _SYNONYM_TASK_MAP = { "audio-ctc": "automatic-speech-recognition", "causal-lm": "text-generation", @@ -260,17 +304,11 @@ class TasksManager: "vision2seq-lm": "image-to-text", "zero-shot-classification": "text-classification", "image-feature-extraction": "feature-extraction", - } - - # Reverse dictionaries str -> str, where several model loaders may map to the same task - _LIBRARY_TO_MODEL_LOADERS_TO_TASKS_MAP = { - "diffusers": get_model_loaders_to_tasks(_DIFFUSERS_TASKS_TO_MODEL_LOADERS), - "sentence_transformers": get_model_loaders_to_tasks(_SENTENCE_TRANSFORMERS_TASKS_TO_MODEL_LOADERS), - "timm": get_model_loaders_to_tasks(_TIMM_TASKS_TO_MODEL_LOADERS), - "transformers": get_model_loaders_to_tasks(_TRANSFORMERS_TASKS_TO_MODEL_LOADERS), - } - _LIBRARY_TO_TF_MODEL_LOADERS_TO_TASKS_MAP = { - "transformers": get_model_loaders_to_tasks(_TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS), + # for backward compatibility and testing (where + # model task and model type are still the same) + "lcm": "text-to-image", + "stable-diffusion": "text-to-image", + "stable-diffusion-xl": "text-to-image", } _CUSTOM_CLASSES = { @@ -281,7 +319,6 @@ class TasksManager: ("pt", "vision-encoder-decoder", "document-question-answering"): ("transformers", "VisionEncoderDecoderModel"), } - # TODO: why feature-extraction-with-past is here? 
_ENCODER_DECODER_TASKS = ( "automatic-speech-recognition", "document-question-answering", @@ -448,6 +485,10 @@ class TasksManager: "zero-shot-image-classification", onnx="CLIPOnnxConfig", ), + "clip-vision-model": supported_tasks_mapping( + "feature-extraction", + onnx="CLIPVisionModelOnnxConfig", + ), "codegen": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -1136,7 +1177,7 @@ class TasksManager: "vae-decoder", "clip-text-model", "clip-text-with-projection", - "trocr", # TODO: why? + "trocr", # supported through the vision-encoder-decoder model type } _SUPPORTED_CLI_MODEL_TYPE = ( set(_SUPPORTED_MODEL_TYPE.keys()) @@ -1411,7 +1452,8 @@ def get_model_files( token = use_auth_token request_exception = None - full_model_path = Path(model_name_or_path) / subfolder + full_model_path = Path(model_name_or_path, subfolder) + if full_model_path.is_dir(): all_files = [ os.path.relpath(os.path.join(dirpath, file), full_model_path) @@ -1431,23 +1473,18 @@ def get_model_files( if subfolder != "": all_files = [file[len(subfolder) + 1 :] for file in all_files if file.startswith(subfolder)] except (RequestsConnectionError, OfflineModeIsEnabled) as e: - request_exception = e - object_id = model_name_or_path.replace("/", "--") - full_model_path = Path(cache_dir, f"models--{object_id}") - if full_model_path.is_dir(): # explore the cache first - # Resolve refs (for instance to convert main to the associated commit sha) - if revision is None: - revision_file = Path(full_model_path, "refs", "main") - revision = "" - if revision_file.is_file(): - with open(revision_file) as f: - revision = f.read() - cached_path = Path(full_model_path, "snapshots", revision, subfolder) + snapshot_path = huggingface_hub.snapshot_download( + repo_id=model_name_or_path, revision=revision, cache_dir=cache_dir, token=token + ) + full_model_path = Path(snapshot_path, subfolder) + if full_model_path.is_dir(): all_files = [ - os.path.relpath(os.path.join(dirpath, file), cached_path) - for dirpath, _, filenames in os.walk(cached_path) + os.path.relpath(os.path.join(dirpath, file), full_model_path) + for dirpath, _, filenames in os.walk(full_model_path) for file in filenames ] + else: + request_exception = e return all_files, request_exception @@ -1455,8 +1492,9 @@ def get_model_files( def determine_framework( model_name_or_path: Union[str, Path], subfolder: str = "", - framework: Optional[str] = None, + revision: Optional[str] = None, cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, ) -> str: """ Determines the framework to use for the export. @@ -1471,20 +1509,25 @@ def determine_framework( model_name_or_path (`Union[str, Path]`): Can be either the model id of a model repo on the Hugging Face Hub, or a path to a local directory containing a model. - subfolder (`str`, defaults to `""`): + subfolder (`str`, *optional*, defaults to `""`): In case the model files are located inside a subfolder of the model directory / repo on the Hugging Face Hub, you can specify the subfolder name here. - framework (`Optional[str]`, *optional*): - The framework to use for the export. See above for priority if none provided. + revision (`Optional[str]`, defaults to `None`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + cache_dir (`Optional[str]`, *optional*): + Path to a directory in which a downloaded pretrained model weights have been cached if the standard cache should not be used. 
+ token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). Returns: `str`: The framework to use for the export. """ - if framework is not None: - return framework - all_files, request_exception = TasksManager.get_model_files(model_name_or_path, subfolder, cache_dir) + all_files, request_exception = TasksManager.get_model_files( + model_name_or_path, subfolder=subfolder, cache_dir=cache_dir, token=token, revision=revision + ) pt_weight_name = Path(WEIGHTS_NAME).stem pt_weight_extension = Path(WEIGHTS_NAME).suffix @@ -1507,7 +1550,7 @@ def determine_framework( elif "model_index.json" in all_files and any( file.endswith((pt_weight_extension, safe_weight_extension)) for file in all_files ): - # stable diffusion case + # diffusers case framework = "pt" elif "config_sentence_transformers.json" in all_files: # Sentence Transformers libary relies on PyTorch. @@ -1538,58 +1581,67 @@ def determine_framework( @classmethod def _infer_task_from_model_or_model_class( cls, - model: Optional[Union["PreTrainedModel", "TFPreTrainedModel"]] = None, - model_class: Optional[Type] = None, + model: Optional[Union["PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline"]] = None, + model_class: Optional[Type[Union["PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline"]]] = None, ) -> str: if model is not None and model_class is not None: raise ValueError("Either a model or a model class must be provided, but both were given here.") if model is None and model_class is None: raise ValueError("Either a model or a model class must be provided, but none were given here.") - target_name = model.__class__.__name__ if model is not None else model_class.__name__ - task_name = None - iterable = () - for _, model_loader in cls._LIBRARY_TO_MODEL_LOADERS_TO_TASKS_MAP.items(): - iterable += (model_loader.items(),) - for _, model_loader in cls._LIBRARY_TO_TF_MODEL_LOADERS_TO_TASKS_MAP.items(): - iterable += (model_loader.items(),) - - pt_auto_module = importlib.import_module("transformers.models.auto.modeling_auto") - tf_auto_module = importlib.import_module("transformers.models.auto.modeling_tf_auto") - for auto_cls_name, task in itertools.chain.from_iterable(iterable): - if any( - ( - target_name.startswith("Auto"), - target_name.startswith("TFAuto"), - "StableDiffusion" in target_name, - ) - ): - if target_name == auto_cls_name: - task_name = task - break - - continue - module = tf_auto_module if auto_cls_name.startswith("TF") else pt_auto_module - # getattr(module, auto_cls_name)._model_mapping is a _LazyMapping, it also has an attribute called - # "_model_mapping" that is what we want here: class names and not actual classes. - auto_cls = getattr(module, auto_cls_name, None) - # This is the case for StableDiffusionPipeline for instance. 
- if auto_cls is None: - continue - model_mapping = auto_cls._model_mapping._model_mapping - if target_name in model_mapping.values(): - task_name = task - break - if task_name is None: - raise ValueError(f"Could not infer the task name for {target_name}.") - - return task_name + target_class_name = model.__class__.__name__ if model is not None else model_class.__name__ + target_class_module = model.__class__.__module__ if model is not None else model_class.__module__ + + # using TASKS_TO_MODEL_LOADERS to infer the task name + tasks_to_model_loaders = None + + if target_class_name.startswith("AutoModel"): + tasks_to_model_loaders = cls._TRANSFORMERS_TASKS_TO_MODEL_LOADERS + elif target_class_name.startswith("TFAutoModel"): + tasks_to_model_loaders = cls._TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS + elif target_class_name.startswith("AutoPipeline"): + tasks_to_model_loaders = cls._DIFFUSERS_TASKS_TO_MODEL_LOADERS + + if tasks_to_model_loaders is not None: + for task_name, model_loaders in tasks_to_model_loaders.items(): + if isinstance(model_loaders, str): + model_loaders = (model_loaders,) + for model_loader_class_name in model_loaders: + if target_class_name == model_loader_class_name: + return task_name + + # using TASKS_TO_MODEL_MAPPINGS to infer the task name + tasks_to_model_mapping = None + + if target_class_module.startswith("transformers"): + if target_class_name.startswith("TF"): + tasks_to_model_mapping = cls._TRANSFORMERS_TASKS_TO_TF_MODEL_MAPPINGS + else: + tasks_to_model_mapping = cls._TRANSFORMERS_TASKS_TO_MODEL_MAPPINGS + elif target_class_module.startswith("diffusers"): + tasks_to_model_mapping = cls._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS + + if tasks_to_model_mapping is not None: + for task_name, model_mapping in tasks_to_model_mapping.items(): + for model_type, model_class_name in model_mapping.items(): + if target_class_name == model_class_name: + return task_name + + raise ValueError( + "The task name could not be automatically inferred. If using the command-line, please provide the argument --task task-name. Example: `--task text-classification`." + ) @classmethod def _infer_task_from_model_name_or_path( - cls, model_name_or_path: str, subfolder: str = "", revision: Optional[str] = None + cls, + model_name_or_path: str, + subfolder: str = "", + revision: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, ) -> str: inferred_task_name = None + is_local = os.path.isdir(os.path.join(model_name_or_path, subfolder)) if is_local: @@ -1603,70 +1655,78 @@ def _infer_task_from_model_name_or_path( "Cannot infer the task from a model repo with a subfolder yet, please specify the task manually." ) try: - model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) + model_info = huggingface_hub.model_info(model_name_or_path, revision=revision, token=token) except (RequestsConnectionError, OfflineModeIsEnabled): raise RuntimeError( f"Hugging Face Hub is not reachable and we cannot infer the task from a cached model. Make sure you are not offline, or otherwise please specify the `task` (or `--task` in command-line) argument ({', '.join(TasksManager.get_all_tasks())})." 
) - library_name = TasksManager.infer_library_from_model(model_name_or_path, subfolder, revision) + library_name = cls.infer_library_from_model( + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + ) - if library_name == "diffusers": - if model_info.config["diffusers"].get("class_name", None): - class_name = model_info.config["diffusers"]["class_name"] - elif model_info.config["diffusers"].get("_class_name", None): - class_name = model_info.config["diffusers"]["_class_name"] - else: - raise ValueError( - f"Could not automatically infer the class name for {model_name_or_path}. Please open an issue at https://github.com/huggingface/optimum/issues." - ) - inferred_task_name = "stable-diffusion-xl" if "StableDiffusionXL" in class_name else "stable-diffusion" - elif library_name == "timm": + if library_name == "timm": inferred_task_name = "image-classification" - else: - pipeline_tag = getattr(model_info, "pipeline_tag", None) - # The Hub task "conversational" is not a supported task per se, just an alias that may map to - # text-generaton or text2text-generation. - # The Hub task "object-detection" is not a supported task per se, as in Transformers this may map to either - # zero-shot-object-detection or object-detection. - if pipeline_tag is not None and pipeline_tag not in ["conversational", "object-detection"]: - inferred_task_name = TasksManager.map_from_synonym(model_info.pipeline_tag) - else: - transformers_info = model_info.transformersInfo - if transformers_info is not None and transformers_info.get("pipeline_tag") is not None: - inferred_task_name = TasksManager.map_from_synonym(transformers_info["pipeline_tag"]) - else: - # transformersInfo does not always have a pipeline_tag attribute - class_name_prefix = "" - if is_torch_available(): - tasks_to_automodels = TasksManager._LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP[library_name] - else: - tasks_to_automodels = TasksManager._LIBRARY_TO_TF_TASKS_TO_MODEL_LOADER_MAP[library_name] - class_name_prefix = "TF" - - auto_model_class_name = transformers_info["auto_model"] - if not auto_model_class_name.startswith("TF"): - auto_model_class_name = f"{class_name_prefix}{auto_model_class_name}" - for task_name, class_name_for_task in tasks_to_automodels.items(): - if class_name_for_task == auto_model_class_name: - inferred_task_name = task_name + elif library_name == "diffusers": + pipeline_tag = model_info.pipeline_tag + model_config = model_info.config + if pipeline_tag is not None: + inferred_task_name = cls.map_from_synonym(pipeline_tag) + elif model_config is not None: + if model_config is not None and model_config.get("diffusers", None) is not None: + diffusers_class_name = model_config["diffusers"]["_class_name"] + for task_name, model_mapping in cls._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS.items(): + for model_type, model_class_name in model_mapping.items(): + if diffusers_class_name == model_class_name: + inferred_task_name = task_name + break + if inferred_task_name is not None: + break + elif library_name == "transformers": + pipeline_tag = model_info.pipeline_tag + transformers_info = model_info.transformersInfo + if pipeline_tag is not None: + inferred_task_name = cls.map_from_synonym(model_info.pipeline_tag) + elif transformers_info is not None: + transformers_pipeline_tag = transformers_info.get("pipeline_tag", None) + transformers_auto_model = transformers_info.get("auto_model", None) + if transformers_pipeline_tag is not None: + pipeline_tag = 
transformers_info["pipeline_tag"] + inferred_task_name = cls.map_from_synonym(pipeline_tag) + elif transformers_auto_model is not None: + transformers_auto_model = transformers_auto_model.replace("TF", "") + for task_name, model_loaders in cls._TRANSFORMERS_TASKS_TO_MODEL_LOADERS.items(): + if isinstance(model_loaders, str): + model_loaders = (model_loaders,) + for model_loader_class_name in model_loaders: + if transformers_auto_model == model_loader_class_name: + inferred_task_name = task_name + break + if inferred_task_name is not None: break if inferred_task_name is None: - raise KeyError(f"Could not find the proper task name for {auto_model_class_name}.") + raise KeyError(f"Could not find the proper task name for the model {model_name_or_path}.") + return inferred_task_name @classmethod def infer_task_from_model( cls, - model: Union[str, "PreTrainedModel", "TFPreTrainedModel", Type], + model: Union[str, "PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline", Type], subfolder: str = "", revision: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, ) -> str: """ - Infers the task from the model repo. + Infers the task from the model repo, model instance, or model class. Args: - model (`str`): + model (`Union[str, PreTrainedModel, TFPreTrainedModel, DiffusionPipeline, Type]`): The model to infer the task from. This can either be the name of a repo on the HuggingFace Hub, an instance of a model, or a model class. subfolder (`str`, *optional*, defaults to `""`): @@ -1674,64 +1734,82 @@ def infer_task_from_model( Face Hub, you can specify the subfolder name here. revision (`Optional[str]`, defaults to `None`): Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + cache_dir (`Optional[str]`, *optional*): + Path to a directory in which a downloaded pretrained model weights have been cached if the standard cache should not be used. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + Returns: - `str`: The task name automatically detected from the model repo. + `str`: The task name automatically detected from the HF hub repo, model instance, or model class. """ - is_torch_pretrained_model = is_torch_available() and isinstance(model, PreTrainedModel) - is_tf_pretrained_model = is_tf_available() and isinstance(model, TFPreTrainedModel) - task = None + inferred_task_name = None + if isinstance(model, str): - task = cls._infer_task_from_model_name_or_path(model, subfolder=subfolder, revision=revision) - elif is_torch_pretrained_model or is_tf_pretrained_model: - task = cls._infer_task_from_model_or_model_class(model=model) - elif inspect.isclass(model): - task = cls._infer_task_from_model_or_model_class(model_class=model) + inferred_task_name = cls._infer_task_from_model_name_or_path( + model_name_or_path=model, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + ) + elif type(model) == type: + inferred_task_name = cls._infer_task_from_model_or_model_class(model_class=model) + else: + inferred_task_name = cls._infer_task_from_model_or_model_class(model=model) - if task is None: - raise ValueError(f"Could not infer the task from {model}.") + if inferred_task_name is None: + raise ValueError( + "The task name could not be automatically inferred. 
If using the command-line, please provide the argument --task task-name. Example: `--task text-classification`." + ) - return task + return inferred_task_name - @staticmethod - def _infer_library_from_model( - model: Union["PreTrainedModel", "TFPreTrainedModel"], library_name: Optional[str] = None + @classmethod + def _infer_library_from_model_or_model_class( + cls, + model: Optional[Union["PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline"]] = None, + model_class: Optional[Type[Union["PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline"]]] = None, ): - if library_name is not None: - return library_name + if model is not None and model_class is not None: + raise ValueError("Either a model or a model class must be provided, but both were given here.") + if model is None and model_class is None: + raise ValueError("Either a model or a model class must be provided, but none were given here.") + + target_class_module = model.__class__.__module__ if model is not None else model_class.__module__ - # SentenceTransformer models have no config attributes - if hasattr(model, "_model_config"): + if target_class_module.startswith("sentence_transformers"): library_name = "sentence_transformers" - elif ( - hasattr(model, "pretrained_cfg") - or hasattr(model.config, "pretrained_cfg") - or hasattr(model.config, "architecture") - ): - library_name = "timm" - elif hasattr(model.config, "_diffusers_version") or getattr(model, "config_name", "") == "model_index.json": - library_name = "diffusers" - else: + elif target_class_module.startswith("transformers"): library_name = "transformers" + elif target_class_module.startswith("diffusers"): + library_name = "diffusers" + elif target_class_module.startswith("timm"): + library_name = "timm" + + if library_name is None: + raise ValueError( + "The library name could not be automatically inferred. If using the command-line, please provide the argument --library {transformers,diffusers,timm,sentence_transformers}. Example: `--library diffusers`." + ) + return library_name @classmethod - def infer_library_from_model( + def _infer_library_from_model_name_or_path( cls, model_name_or_path: Union[str, Path], subfolder: str = "", revision: Optional[str] = None, cache_dir: str = HUGGINGFACE_HUB_CACHE, - library_name: Optional[str] = None, - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, ): """ - Infers the library from the model repo. + Infers the library from the model name or path. Args: model_name_or_path (`str`): - The model to infer the task from. This can either be the name of a repo on the HuggingFace Hub, an - instance of a model, or a model class. + The model to infer the task from. This can either be the name of a repo on the HuggingFace Hub, or a path + to a local directory containing the model. subfolder (`str`, defaults to `""`): In case the model files are located inside a subfolder of the model directory / repo on the Hugging Face Hub, you can specify the subfolder name here. @@ -1739,10 +1817,6 @@ def infer_library_from_model( Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. cache_dir (`Optional[str]`, *optional*): Path to a directory in which a downloaded pretrained model weights have been cached if the standard cache should not be used. - library_name (`Optional[str]`, *optional*): - The library name of the model. Can be any of "transformers", "timm", "diffusers", "sentence_transformers". 
- use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`): - Deprecated. Please use the `token` argument instead. token (`Optional[Union[bool,str]]`, defaults to `None`): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). @@ -1751,72 +1825,64 @@ def infer_library_from_model( `str`: The library name automatically detected from the model repo. """ - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - - if library_name is not None: - return library_name + inferred_library_name = None all_files, _ = TasksManager.get_model_files( - model_name_or_path, subfolder, cache_dir, token=token, revision=revision + model_name_or_path, + subfolder=subfolder, + cache_dir=cache_dir, + revision=revision, + token=token, ) if "model_index.json" in all_files: - library_name = "diffusers" + inferred_library_name = "diffusers" elif ( any(file_path.startswith("sentence_") for file_path in all_files) or "config_sentence_transformers.json" in all_files ): - library_name = "sentence_transformers" - elif CONFIG_NAME in all_files: - # We do not use PretrainedConfig.from_pretrained which has unwanted warnings about model type. + inferred_library_name = "sentence_transformers" + elif "config.json" in all_files: kwargs = { "subfolder": subfolder, "revision": revision, "cache_dir": cache_dir, "token": token, } + # We do not use PretrainedConfig.from_pretrained which has unwanted warnings about model type. config_dict, kwargs = PretrainedConfig.get_config_dict(model_name_or_path, **kwargs) model_config = PretrainedConfig.from_dict(config_dict, **kwargs) if hasattr(model_config, "pretrained_cfg") or hasattr(model_config, "architecture"): - library_name = "timm" + inferred_library_name = "timm" elif hasattr(model_config, "_diffusers_version"): - library_name = "diffusers" + inferred_library_name = "diffusers" else: - library_name = "transformers" - else: - library_name = "transformers" + inferred_library_name = "transformers" - if library_name is None: + if inferred_library_name is None: raise ValueError( "The library name could not be automatically inferred. If using the command-line, please provide the argument --library {transformers,diffusers,timm,sentence_transformers}. Example: `--library diffusers`." ) - return library_name + return inferred_library_name @classmethod - def standardize_model_attributes( + def infer_library_from_model( cls, - model: Union["PreTrainedModel", "TFPreTrainedModel"], - library_name: Optional[str] = None, + model: Union[str, "PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline", Type], + subfolder: str = "", + revision: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, ): """ - Updates the model for export. This function is suitable to make required changes to the models from different - libraries to follow transformers style. + Infers the library from the model repo, model instance, or model class. Args: - model_name_or_path (`Union[str, Path]`): - Can be either the model id of a model repo on the Hugging Face Hub, or a path to a local directory - containing a model. 
- model (`Union[PreTrainedModel, TFPreTrainedModel]`): - The instance of the model. + model (`Union[str, PreTrainedModel, TFPreTrainedModel, DiffusionPipeline, Type]`): + The model to infer the task from. This can either be the name of a repo on the HuggingFace Hub, an + instance of a model, or a model class. subfolder (`str`, defaults to `""`): In case the model files are located inside a subfolder of the model directory / repo on the Hugging Face Hub, you can specify the subfolder name here. @@ -1824,20 +1890,66 @@ def standardize_model_attributes( Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. cache_dir (`Optional[str]`, *optional*): Path to a directory in which a downloaded pretrained model weights have been cached if the standard cache should not be used. - library_name (`Optional[str]`, *optional*):: - The library name of the model. Can be any of "transformers", "timm", "diffusers", "sentence_transformers". + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + + Returns: + `str`: The library name automatically detected from the model repo, model instance, or model class. """ - library_name = TasksManager._infer_library_from_model(model, library_name) + + if isinstance(model, str): + library_name = cls._infer_library_from_model_name_or_path( + model_name_or_path=model, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + ) + elif type(model) == type: + library_name = cls._infer_library_from_model_or_model_class(model_class=model) + else: + library_name = cls._infer_library_from_model_or_model_class(model=model) + + return library_name + + @classmethod + def standardize_model_attributes(cls, model: Union["PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline"]): + """ + Updates the model for export. This function is suitable to make required changes to the models from different + libraries to follow transformers style. + + Args: + model (`Union[PreTrainedModel, TFPreTrainedModel, DiffusionPipeline]`): + The instance of the model. + + """ + + library_name = TasksManager.infer_library_from_model(model) if library_name == "diffusers": - model.config.export_model_type = "stable-diffusion" - elif library_name == "timm": - # Retrieve model config - model_config = PretrainedConfig.from_dict(model.pretrained_cfg) + inferred_model_type = None + + for task_name, model_mapping in cls._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS.items(): + for model_type, model_class_name in model_mapping.items(): + if model.__class__.__name__ == model_class_name: + inferred_model_type = model_type + break + if inferred_model_type is not None: + break + + if inferred_model_type is None: + raise ValueError( + f"The export of a DiffusionPipeline model with the class name {model.__class__.__name__} is currently not supported in Optimum. " + "Please open an issue or submit a PR to add the support." + ) - # Set config as in transformers - setattr(model, "config", model_config) + # `model_type` is a class attribute in Transformers, let's avoid modifying it. + model.config.export_model_type = inferred_model_type + elif library_name == "timm": + # Retrieve model config and set it like in transformers + model.config = PretrainedConfig.from_dict(model.pretrained_cfg) # `model_type` is a class attribute in Transformers, let's avoid modifying it. 
model.config.export_model_type = model.pretrained_cfg["architecture"] @@ -1881,13 +1993,14 @@ def get_model_from_task( model_name_or_path: Union[str, Path], subfolder: str = "", revision: Optional[str] = None, - framework: Optional[str] = None, cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, + framework: Optional[str] = None, torch_dtype: Optional["torch.dtype"] = None, device: Optional[Union["torch.device", str]] = None, - library_name: str = None, + library_name: Optional[str] = None, **model_kwargs, - ) -> Union["PreTrainedModel", "TFPreTrainedModel"]: + ) -> Union["PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline"]: """ Retrieves a model from its name and the task to be enabled. @@ -1902,34 +2015,44 @@ def get_model_from_task( Face Hub, you can specify the subfolder name here. revision (`Optional[str]`, *optional*): Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + cache_dir (`Optional[str]`, *optional*): + Path to a directory in which a downloaded pretrained model weights have been cached if the standard cache should not be used. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). framework (`Optional[str]`, *optional*): The framework to use for the export. See `TasksManager.determine_framework` for the priority should none be provided. - cache_dir (`Optional[str]`, *optional*): - Path to a directory in which a downloaded pretrained model weights have been cached if the standard cache should not be used. torch_dtype (`Optional[torch.dtype]`, defaults to `None`): Data type to load the model on. PyTorch-only argument. device (`Optional[torch.device]`, defaults to `None`): Device to initialize the model on. PyTorch-only argument. For PyTorch, defaults to "cpu". - model_kwargs (`Dict[str, Any]`, *optional*): - Keyword arguments to pass to the model `.from_pretrained()` method. library_name (`Optional[str]`, defaults to `None`): The library name of the model. Can be any of "transformers", "timm", "diffusers", "sentence_transformers". See `TasksManager.infer_library_from_model` for the priority should none be provided. + model_kwargs (`Dict[str, Any]`, *optional*): + Keyword arguments to pass to the model `.from_pretrained()` method. Returns: The instance of the model. 
""" - framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) + + if framework is None: + framework = TasksManager.determine_framework( + model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) + + if library_name is None: + library_name = TasksManager.infer_library_from_model( + model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) original_task = task if task == "auto": - task = TasksManager.infer_task_from_model(model_name_or_path, subfolder=subfolder, revision=revision) - - library_name = TasksManager.infer_library_from_model( - model_name_or_path, subfolder, revision, cache_dir, library_name - ) + task = TasksManager.infer_task_from_model( + model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) model_type = None model_class_name = None @@ -2004,7 +2127,7 @@ def get_model_from_task( kwargs["from_pt"] = True model = model_class.from_pretrained(model_name_or_path, **kwargs) - TasksManager.standardize_model_attributes(model, library_name) + TasksManager.standardize_model_attributes(model) return model diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index 74d2d983850..902dd89f777 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -43,6 +43,18 @@ f"We found an older version of diffusers {_diffusers_version} but we require diffusers to be >= {DIFFUSERS_MINIMUM_VERSION}. " "Please update diffusers by running `pip install --upgrade diffusers`" ) + + from diffusers import ( + DiffusionPipeline, + LatentConsistencyModelImg2ImgPipeline, + LatentConsistencyModelPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionPipeline, + StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLPipeline, + ) from diffusers.models.attention_processor import ( Attention, AttnAddedKVProcessor, @@ -53,6 +65,7 @@ LoRAAttnProcessor2_0, ) + if TYPE_CHECKING: from .base import ExportConfig @@ -63,7 +76,7 @@ from transformers.modeling_tf_utils import TFPreTrainedModel if is_diffusers_available(): - from diffusers import ModelMixin, StableDiffusionPipeline + from diffusers import DiffusionPipeline, ModelMixin ENCODER_NAME = "encoder_model" @@ -72,23 +85,40 @@ DECODER_MERGED_NAME = "decoder_model_merged" -def _get_submodels_for_export_stable_diffusion( - pipeline: "StableDiffusionPipeline", +def _get_submodels_for_export_diffusion( + pipeline: "DiffusionPipeline", ) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]: """ Returns the components of a Stable Diffusion model. 
""" - from diffusers import StableDiffusionXLImg2ImgPipeline - models_for_export = {} - if isinstance(pipeline, StableDiffusionXLImg2ImgPipeline): + is_stable_diffusion = isinstance( + pipeline, (StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline) + ) + is_stable_diffusion_xl = isinstance( + pipeline, (StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline) + ) + is_latent_consistency_model = isinstance( + pipeline, (LatentConsistencyModelPipeline, LatentConsistencyModelImg2ImgPipeline) + ) + + if is_stable_diffusion_xl: projection_dim = pipeline.text_encoder_2.config.projection_dim - else: + elif is_stable_diffusion: projection_dim = pipeline.text_encoder.config.projection_dim + elif is_latent_consistency_model: + projection_dim = pipeline.text_encoder.config.projection_dim + else: + raise ValueError( + f"The export of a DiffusionPipeline model with the class name {pipeline.__class__.__name__} is currently not supported in Optimum. " + "Please open an issue or submit a PR to add the support." + ) + + models_for_export = {} # Text encoder if pipeline.text_encoder is not None: - if isinstance(pipeline, StableDiffusionXLImg2ImgPipeline): + if is_stable_diffusion_xl: pipeline.text_encoder.config.output_hidden_states = True models_for_export["text_encoder"] = pipeline.text_encoder @@ -97,6 +127,7 @@ def _get_submodels_for_export_stable_diffusion( is_torch_greater_or_equal_than_2_1 = version.parse(torch.__version__) >= version.parse("2.1.0") if not is_torch_greater_or_equal_than_2_1: pipeline.unet.set_attn_processor(AttnProcessor()) + pipeline.unet.config.text_encoder_projection_dim = projection_dim # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 @@ -258,17 +289,17 @@ def get_decoder_models_for_export( return models_for_export -def get_stable_diffusion_models_for_export( - pipeline: "StableDiffusionPipeline", +def get_diffusion_models_for_export( + pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "onnx", ) -> Dict[str, Tuple[Union["PreTrainedModel", "ModelMixin"], "ExportConfig"]]: """ - Returns the components of a Stable Diffusion model and their subsequent export configs. + Returns the components of a Diffusion model and their subsequent export configs. Args: - pipeline ([`StableDiffusionPipeline`]): + pipeline ([`DiffusionPipeline`]): The model to export. int_dtype (`str`, defaults to `"int64"`): The data type of integer tensors, could be ["int64", "int32", "int8"], default to "int64". @@ -279,7 +310,7 @@ def get_stable_diffusion_models_for_export( `Dict[str, Tuple[Union[`PreTrainedModel`, `TFPreTrainedModel`], `ExportConfig`]: A Dict containing the model and export configs for the different components of the model. 
""" - models_for_export = _get_submodels_for_export_stable_diffusion(pipeline) + models_for_export = _get_submodels_for_export_diffusion(pipeline) # Text encoder if "text_encoder" in models_for_export: @@ -505,7 +536,7 @@ def override_diffusers_2_0_attn_processors(model): def _get_submodels_and_export_configs( - model: Union["PreTrainedModel", "TFPreTrainedModel"], + model: Union["PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline"], task: str, monolith: bool, custom_export_configs: Dict, @@ -523,7 +554,7 @@ def _get_submodels_and_export_configs( if not custom_architecture: if library_name == "diffusers": export_config = None - models_and_export_configs = get_stable_diffusion_models_for_export( + models_and_export_configs = get_diffusion_models_for_export( model, int_dtype=int_dtype, float_dtype=float_dtype, exporter=exporter ) else: @@ -575,7 +606,7 @@ def _get_submodels_and_export_configs( submodels_for_export = fn_get_submodels(model) else: if library_name == "diffusers": - submodels_for_export = _get_submodels_for_export_stable_diffusion(model) + submodels_for_export = _get_submodels_for_export_diffusion(model) elif ( model.config.is_encoder_decoder and task.startswith(TasksManager._ENCODER_DECODER_TASKS) @@ -599,7 +630,7 @@ def _get_submodels_and_export_configs( for key, custom_export_config in custom_export_configs.items(): models_and_export_configs[key] = (submodels_for_export[key], custom_export_config) - # Default to the first ONNX config for stable-diffusion and custom architecture case. + # Default to the first ONNX config for diffusion and custom architecture case. if export_config is None: export_config = next(iter(models_and_export_configs.values()))[1] diff --git a/optimum/gptq/eval.py b/optimum/gptq/eval.py new file mode 100644 index 00000000000..3ae6e4d7bf5 --- /dev/null +++ b/optimum/gptq/eval.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn +from datasets import load_dataset +from tqdm import tqdm + + +def evaluate_perplexity(model, tokenizer): + def _perplexity(nlls, n_samples, seqlen): + return torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen)) + + # load and prepare dataset + data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + data = tokenizer("\n\n".join(data["text"]), return_tensors="pt") + data = data.input_ids.to(model.device) + + seqlen = 512 + model = model.eval() + n_samples = data.numel() // seqlen + + nlls = [] + + with tqdm(range(n_samples), desc="Perplexity -") as progress_bar: + for i in progress_bar: + start_index = i * seqlen + end_index = (i + 1) * seqlen + batch = data[:, start_index:end_index].to(model.device) + with torch.no_grad(): + logits = model(batch).logits + shift_logits = logits[:, :-1, :].contiguous().float() + shift_labels = data[:, start_index:end_index][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * seqlen + nlls.append(neg_log_likelihood) + + curr_ppl = _perplexity(nlls, i + 1, seqlen) + progress_bar.set_description(f"Perplexity {curr_ppl:.3f}") + + ppl = _perplexity(nlls, n_samples, seqlen) + + return ppl.item() diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 74b05d5b151..5bab0622de4 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -47,8 +47,8 @@ user or organization name, like `dbmdz/bert-base-german-cased`. - A path to a *directory* containing a model saved using [`~OptimizedModel.save_pretrained`], e.g., `./my_model_directory/`. 
- from_transformers (`bool`, defaults to `False`): - Defines whether the provided `model_id` contains a vanilla Transformers checkpoint. + export (`bool`, defaults to `False`): + Defines whether the provided `model_id` needs to be exported to the targeted format. force_download (`bool`, defaults to `True`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. @@ -71,6 +71,10 @@ Whether or not to allow for custom code defined on the Hub in their own modeling. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. + revision (`Optional[str]`, defaults to `None`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. """ @@ -415,13 +419,6 @@ def from_pretrained( trust_remote_code=trust_remote_code, ) - if not export and trust_remote_code: - logger.warning( - "The argument `trust_remote_code` is to be used along with export=True. It will be ignored." - ) - elif export and trust_remote_code is None: - trust_remote_code = False - from_pretrained_method = cls._from_transformers if export else cls._from_pretrained return from_pretrained_method( diff --git a/optimum/onnxruntime/model.py b/optimum/onnxruntime/model.py index 23ca6e5e6a6..caa662f3824 100644 --- a/optimum/onnxruntime/model.py +++ b/optimum/onnxruntime/model.py @@ -49,6 +49,11 @@ def __init__( label_names (`List[str]`, `optional`): The list of keys in your dictionary of inputs that correspond to the labels. """ + + logger.warning( + "The class `optimum.onnxruntime.model.ORTModel` is deprecated and will be removed in the next release." + ) + self.compute_metrics = compute_metrics self.label_names = ["labels"] if label_names is None else label_names self.session = InferenceSession(str(model_path), providers=[execution_provider]) diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index fd7e741d7c0..6a0dcbba2f0 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -121,6 +121,7 @@ class ORTModelForCausalLM(ORTModel, GenerationMixin): auto_model_class = AutoModelForCausalLM main_input_name = "input_ids" + _supports_cache_class = False def __init__( self, diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index b3bad65954d..126b1e65366 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -1091,10 +1091,11 @@ def forward( onnx_outputs = self.model.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # TODO: why do we only return last_hidden_state? why not all outputs? - # that way, there will be less need for ORTModelForCustomTask in cases where - # we just want to extend model outputs with attentions, hidden_states, etc. - last_hidden_state = model_outputs["last_hidden_state"] + if "last_hidden_state" in self.output_names: + last_hidden_state = model_outputs["last_hidden_state"] + else: + # TODO: This allows to support sentence-transformers models (sentence embedding), but is not validated. 
+ last_hidden_state = next(iter(model_outputs.values())) # converts output to namedtuple for pipelines post-processing return BaseModelOutput(last_hidden_state=last_hidden_state) diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 89a0ae44d58..3b1af05d0f5 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -570,6 +570,7 @@ class ORTModelForConditionalGeneration(ORTModel, ABC): # Used in from_transformers to export model to onnxORTEncoder base_model_prefix = "onnx_model" + _supports_cache_class = False def __init__( self, diff --git a/optimum/onnxruntime/subpackage/commands/optimize.py b/optimum/onnxruntime/subpackage/commands/optimize.py index 1dd82f0ee22..aee0ed49515 100644 --- a/optimum/onnxruntime/subpackage/commands/optimize.py +++ b/optimum/onnxruntime/subpackage/commands/optimize.py @@ -87,7 +87,7 @@ def run(self): optimizer = ORTOptimizer.from_pretrained(self.args.onnx_model, file_names) if self.args.config: - optimization_config = ORTConfig + optimization_config = ORTConfig.from_pretrained(self.args.config).optimization elif self.args.O1: optimization_config = AutoOptimizationConfig.O1() elif self.args.O2: @@ -97,6 +97,6 @@ def run(self): elif self.args.O4: optimization_config = AutoOptimizationConfig.O4() else: - optimization_config = ORTConfig.from_pretained(self.args.config).optimization + raise ValueError("Either -O1, -O2, -O3, -O4 or -c must be specified.") optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config) diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 37d0feefcc4..ad40af92b9d 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -17,11 +17,15 @@ import re from enum import Enum from inspect import signature -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch from packaging import version +from tqdm import tqdm +from transformers import EvalPrediction +from transformers.trainer_pt_utils import nested_concat +from transformers.trainer_utils import EvalLoopOutput from transformers.utils import logging import onnxruntime as ort @@ -30,6 +34,12 @@ from ..utils.import_utils import _is_package_available +if TYPE_CHECKING: + from datasets import Dataset + + from .modeling_ort import ORTModel + + logger = logging.get_logger(__name__) ONNX_WEIGHTS_NAME = "model.onnx" @@ -341,3 +351,53 @@ class ORTQuantizableOperator(Enum): Resize = "Resize" AveragePool = "AveragePool" Concat = "Concat" + + +def evaluation_loop( + model: "ORTModel", + dataset: "Dataset", + label_names: Optional[List[str]] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, +): + """ + Run evaluation and returns metrics and predictions. + + Args: + model (`ORTModel`): + The ONNXRuntime model to use for the evaluation step. + dataset (`datasets.Dataset`): + Dataset to use for the evaluation step. + label_names (`List[str]`, `optional`): + The list of keys in your dictionary of inputs that correspond to the labels. + compute_metrics (`Callable[[EvalPrediction], Dict]`, `optional`): + The function that will be used to compute metrics at evaluation. Must take an `EvalPrediction` and + return a dictionary string to metric values. 
+ """ + + all_preds = None + all_labels = None + + for inputs in tqdm(dataset, desc="Evaluation"): + has_labels = all(inputs.get(k) is not None for k in label_names) + if has_labels: + labels = tuple(np.array([inputs.get(name)]) for name in label_names) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + inputs = {key: np.array([inputs[key]]) for key in model.input_names if key in inputs} + preds = model(**inputs) + + if len(preds) == 1: + preds = preds[0] + + all_preds = preds if all_preds is None else nested_concat(all_preds, preds, padding_index=-100) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + if compute_metrics is not None and all_preds is not None and all_labels is not None: + metrics = compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + else: + metrics = {} + + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=len(dataset)) diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index a5df9e26245..4a57fda79ce 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -50,7 +50,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ TORCH_MINIMUM_VERSION = version.parse("1.11.0") TRANSFORMERS_MINIMUM_VERSION = version.parse("4.25.0") -DIFFUSERS_MINIMUM_VERSION = version.parse("0.18.0") +DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0") AUTOGPTQ_MINIMUM_VERSION = version.parse("0.4.99") # Allows 0.5.0.dev0 diff --git a/optimum/version.py b/optimum/version.py index 6deb421ee56..8eeeb9d05a7 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.21.0.dev0" +__version__ = "1.22.0.dev0" diff --git a/setup.py b/setup.py index 6b28fb696be..41598aeba5f 100644 --- a/setup.py +++ b/setup.py @@ -15,10 +15,10 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.26.0,<4.42.0", + "transformers[sentencepiece]>=4.26.0,<4.43.0", "torch>=1.11", "packaging", - "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 + "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 "huggingface_hub>=0.8.0", "datasets", ] @@ -75,11 +75,12 @@ "transformers[sentencepiece]>=4.26.0,<4.38.0", ], "diffusers": ["diffusers"], - "intel": "optimum-intel>=1.16.0", - "openvino": "optimum-intel[openvino]>=1.16.0", - "nncf": "optimum-intel[nncf]>=1.16.0", - "neural-compressor": "optimum-intel[neural-compressor]>=1.16.0", - "habana": ["optimum-habana", "transformers >= 4.38.0, < 4.39.0"], + "intel": "optimum-intel>=1.18.0", + "openvino": "optimum-intel[openvino]>=1.18.0", + "nncf": "optimum-intel[nncf]>=1.18.0", + "neural-compressor": "optimum-intel[neural-compressor]>=1.18.0", + "ipex": "optimum-intel[ipex]>=1.18.0", + "habana": ["optimum-habana", "transformers >= 4.40.0, < 4.41.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], "graphcore": "optimum-graphcore", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 400328fae1f..0ef7779cb55 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -56,6 +56,7 @@ "bloom": "hf-internal-testing/tiny-random-BloomModel", "camembert": "hf-internal-testing/tiny-random-camembert", "clip": "hf-internal-testing/tiny-random-CLIPModel", + "clip-vision-model": "fxmarty/clip-vision-model-tiny", "convbert": "hf-internal-testing/tiny-random-ConvBertModel", "convnext": "hf-internal-testing/tiny-random-convnext", "convnextv2": "hf-internal-testing/tiny-random-ConvNextV2Model", @@ -294,9 +295,10 @@ "roberta": "roberta-base", } -PYTORCH_STABLE_DIFFUSION_MODEL = { +PYTORCH_DIFFUSION_MODEL = { "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", + "lcm": "echarlaix/tiny-random-latent-consistency", } PYTORCH_REMOTE_CODE_MODELS = { diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py index a55b5dc8108..d4584bd06e0 100644 --- a/tests/exporters/onnx/test_exporters_onnx_cli.py +++ b/tests/exporters/onnx/test_exporters_onnx_cli.py @@ -41,9 +41,9 @@ from ..exporters_utils import ( NO_DYNAMIC_AXES_EXPORT_SHAPES_TRANSFORMERS, + PYTORCH_DIFFUSION_MODEL, PYTORCH_EXPORT_MODELS_TINY, PYTORCH_SENTENCE_TRANSFORMERS_MODEL, - PYTORCH_STABLE_DIFFUSION_MODEL, PYTORCH_TIMM_MODEL, PYTORCH_TIMM_MODEL_NO_DYNAMIC_AXES, PYTORCH_TRANSFORMERS_MODEL_NO_DYNAMIC_AXES, @@ -253,29 +253,29 @@ def _onnx_export_no_dynamic_axes( except MinimumVersionError as e: pytest.skip(f"Skipping due to minimum version requirements not met. 
Full error: {e}") - @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) + @parameterized.expand(PYTORCH_DIFFUSION_MODEL.items()) @require_torch @require_vision @require_diffusers - def test_exporters_cli_pytorch_cpu_stable_diffusion(self, model_type: str, model_name: str): + def test_exporters_cli_pytorch_cpu_diffusion(self, model_type: str, model_name: str): self._onnx_export(model_name, model_type) - @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) + @parameterized.expand(PYTORCH_DIFFUSION_MODEL.items()) @require_torch_gpu @require_vision @require_diffusers @slow @pytest.mark.run_slow - def test_exporters_cli_pytorch_gpu_stable_diffusion(self, model_type: str, model_name: str): + def test_exporters_cli_pytorch_gpu_diffusion(self, model_type: str, model_name: str): self._onnx_export(model_name, model_type, device="cuda") - @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) + @parameterized.expand(PYTORCH_DIFFUSION_MODEL.items()) @require_torch_gpu @require_vision @require_diffusers @slow @pytest.mark.run_slow - def test_exporters_cli_fp16_stable_diffusion(self, model_type: str, model_name: str): + def test_exporters_cli_fp16_diffusion(self, model_type: str, model_name: str): self._onnx_export(model_name, model_type, device="cuda", fp16=True) @parameterized.expand( @@ -595,7 +595,7 @@ def test_trust_remote_code(self): check=True, ) - def test_stable_diffusion(self): + def test_diffusion(self): with TemporaryDirectory() as tmpdirname: subprocess.run( f"python3 -m optimum.exporters.onnx --model hf-internal-testing/tiny-stable-diffusion-torch --task stable-diffusion {tmpdirname}", diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index 46ace4157a6..6065100f0df 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -32,8 +32,8 @@ OnnxConfigWithPast, export_models, get_decoder_models_for_export, + get_diffusion_models_for_export, get_encoder_decoder_models_for_export, - get_stable_diffusion_models_for_export, main_export, onnx_export_from_model, validate_models_outputs, @@ -48,6 +48,7 @@ from optimum.utils.testing_utils import grid_parameters, require_diffusers from ..exporters_utils import ( + PYTORCH_DIFFUSION_MODEL, PYTORCH_EXPORT_MODELS_TINY, PYTORCH_SENTENCE_TRANSFORMERS_MODEL, PYTORCH_STABLE_DIFFUSION_MODEL, @@ -295,7 +296,7 @@ def _onnx_export( def _onnx_export_sd(self, model_type: str, model_name: str, device="cpu"): pipeline = TasksManager.get_model_from_task(model_type, model_name, device=device) - models_and_onnx_configs = get_stable_diffusion_models_for_export(pipeline) + models_and_onnx_configs = get_diffusion_models_for_export(pipeline) output_names = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs] model, _ = models_and_onnx_configs["vae_encoder"] model.forward = lambda sample: {"latent_sample": model.encode(x=sample)["latent_dist"].parameters} @@ -400,14 +401,14 @@ def test_tensorflow_export( self._onnx_export(test_name, model_type, model_name, task, onnx_config_class_constructor, monolith=monolith) - @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) + @parameterized.expand(PYTORCH_DIFFUSION_MODEL.items()) @require_torch @require_vision @require_diffusers - def test_pytorch_export_for_stable_diffusion_models(self, model_type, model_name): + def test_pytorch_export_for_diffusion_models(self, model_type, model_name): self._onnx_export_sd(model_type, model_name) - 
@parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) + @parameterized.expand(PYTORCH_DIFFUSION_MODEL.items()) @require_torch @require_vision @require_diffusers @@ -415,7 +416,7 @@ def test_pytorch_export_for_stable_diffusion_models(self, model_type, model_name @slow @pytest.mark.run_slow @pytest.mark.gpu_test - def test_pytorch_export_for_stable_diffusion_models_cuda(self, model_type, model_name): + def test_pytorch_export_for_diffusion_models_cuda(self, model_type, model_name): self._onnx_export_sd(model_type, model_name, device="cuda") diff --git a/tests/gptq/Dockerfile_quantization_gpu b/tests/gptq/Dockerfile_quantization_gpu deleted file mode 100644 index 34a2a135529..00000000000 --- a/tests/gptq/Dockerfile_quantization_gpu +++ /dev/null @@ -1,26 +0,0 @@ -FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04 -CMD nvidia-smi - -# Ignore interactive questions during `docker build` -ENV DEBIAN_FRONTEND noninteractive - -# Install and update tools to minimize security vulnerabilities -RUN apt-get update -RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ - bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev python3-pip && \ - apt-get clean -RUN unattended-upgrade -RUN apt-get autoremove -y - -RUN python3 -m pip install -U pip - -RUN pip install torch torchvision torchaudio -RUN pip install transformers accelerate auto-gptq datasets - -# Install Optimum -COPY . /workspace/optimum -RUN pip install /workspace/optimum[tests] - -ENV RUN_SLOW=1 -WORKDIR /workspace/optimum/tests/ -CMD pytest gptq/test_*.py --durations=0 -s -vvvvv diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 5ed1619fde3..220d0235860 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -23,12 +23,19 @@ from optimum.gptq import GPTQQuantizer, load_quantized_model from optimum.gptq.data import get_dataset -from optimum.utils.import_utils import is_auto_gptq_available -from optimum.utils.testing_utils import require_accelerate, require_auto_gptq, require_torch_gpu +from optimum.gptq.eval import evaluate_perplexity +from optimum.gptq.utils import get_block_name_with_pattern, get_preceding_modules, get_seqlen +from optimum.utils import recurse_getattr +from optimum.utils.import_utils import is_accelerate_available, is_auto_gptq_available +from optimum.utils.testing_utils import require_auto_gptq, require_torch_gpu if is_auto_gptq_available(): from auto_gptq import AutoGPTQForCausalLM + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear + +if is_accelerate_available(): + from accelerate import init_empty_weights @slow @@ -37,15 +44,10 @@ class GPTQTest(unittest.TestCase): model_name = "bigscience/bloom-560m" - input_text = "Hello my name is" - EXPECTED_OUTPUTS = set() - EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. 
I") - EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") - EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.") - EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of") + expected_fp16_perplexity = 30 + expected_quantized_perplexity = 34 - # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings - EXPECTED_RELATIVE_DIFFERENCE = 1.664253062 + expected_compression_ratio = 1.66 bits = 4 group_size = 128 @@ -53,24 +55,30 @@ class GPTQTest(unittest.TestCase): disable_exllama = True exllama_config = None cache_block_outputs = True - modules_to_quantize_inside_block = None + modules_in_block_to_quantize = None device_map_for_quantization = "cuda" + device_for_inference = 0 dataset = [ "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." ] - # called only once for all test in this class + # called only once for all tests in this class @classmethod def setUpClass(cls): """ Setup quantized model """ + + cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name) + cls.model_fp16 = AutoModelForCausalLM.from_pretrained( cls.model_name, torch_dtype=torch.float16, device_map=cls.device_map_for_quantization ) - cls.mem_fp16 = cls.model_fp16.get_memory_footprint() + cls.fp16_mem = cls.model_fp16.get_memory_footprint() + + if cls.device_map_for_quantization != "cpu": + cls.fp16_ppl = evaluate_perplexity(cls.model_fp16, cls.tokenizer) - cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True) cls.quantizer = GPTQQuantizer( bits=cls.bits, dataset=cls.dataset, @@ -79,10 +87,13 @@ def setUpClass(cls): disable_exllama=cls.disable_exllama, exllama_config=cls.exllama_config, cache_block_outputs=cls.cache_block_outputs, - modules_to_quantize_inside_block=cls.modules_to_quantize_inside_block, + modules_in_block_to_quantize=cls.modules_in_block_to_quantize, ) + cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer).to(cls.device_for_inference) + cls.quantized_mem = cls.quantized_model.get_memory_footprint() - cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer) + if cls.device_map_for_quantization != "cpu": + cls.quantized_ppl = evaluate_perplexity(cls.quantized_model, cls.tokenizer) def test_memory_footprint(self): """ @@ -90,19 +101,26 @@ def test_memory_footprint(self): memory footprint of the converted model and the class type of the linear layers of the converted models """ - mem_quantized = self.quantized_model.get_memory_footprint() + self.assertAlmostEqual(self.fp16_mem / self.quantized_mem, self.expected_compression_ratio, places=2) - self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE) + def test_perplexity(self): + """ + A simple test to check if the model conversion has been done correctly by checking on the + the perplexity of the converted models + """ + + self.assertEqual(int(self.fp16_ppl), self.expected_fp16_perplexity) + self.assertEqual(int(self.quantized_ppl), self.expected_quantized_perplexity) def test_quantized_layers_class(self): """ A simple test to check if the model conversion has been done correctly by checking on the the class type of the linear layers of the converted models """ - from auto_gptq.utils.import_utils import dynamically_import_QuantLinear QuantLinear = dynamically_import_QuantLinear( use_triton=False, + use_qigen=False, desc_act=self.desc_act, 
group_size=self.group_size, bits=self.bits, @@ -114,32 +132,10 @@ def test_quantized_layers_class(self): def check_quantized_layers_type(self, model, value): self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE == value) - def check_inference_correctness(self, model): - """ - Test the generation quality of the quantized model and see that we are matching the expected output. - Given that we are operating on small numbers + the testing model is relatively small, we might not get - the same output across GPUs. So we'll generate few tokens (5-10) and check their output. - """ - # Check that inference pass works on the model - encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - - # Get the generation - output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) - - # Check the exactness of the result - self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) - - def test_generate_quality(self): - self.check_inference_correctness(self.quantized_model) - - @require_torch_gpu - @require_accelerate - @slow def test_serialization(self): """ Test the serialization of the model and the loading of the quantized weights """ - from accelerate import init_empty_weights with tempfile.TemporaryDirectory() as tmpdirname: self.quantizer.save(self.quantized_model, tmpdirname) @@ -152,7 +148,7 @@ def test_serialization(self): quantized_model_from_saved = load_quantized_model( empty_model, save_folder=tmpdirname, - device_map={"": 0}, + device_map={"": self.device_for_inference}, disable_exllama=self.disable_exllama, exllama_config=self.exllama_config, ) @@ -161,54 +157,37 @@ def test_serialization(self): else: self.check_quantized_layers_type(quantized_model_from_saved, "exllama") - with torch.device("cuda"): - _ = AutoModelForCausalLM.from_pretrained(tmpdirname) - _ = AutoGPTQForCausalLM.from_quantized(tmpdirname) - - self.check_inference_correctness(quantized_model_from_saved) + # transformers and auto-gptq compatibility + # quantized models are more compatible with device map than + # device context managers (they're never used in transformers testing suite) + _ = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": self.device_for_inference}) + _ = AutoGPTQForCausalLM.from_quantized(tmpdirname, device_map={"": self.device_for_inference}) class GPTQTestCPUInit(GPTQTest): device_map_for_quantization = "cpu" - def test_generate_quality(self): - self.check_inference_correctness(self.quantized_model.to(0)) + def test_perplexity(self): + pass class GPTQTestExllama(GPTQTest): disable_exllama = False exllama_config = {"version": 1} - EXPECTED_OUTPUTS = set() - EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") - EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") - EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of") - EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the") class GPTQTestActOrder(GPTQTest): - EXPECTED_OUTPUTS = set() - EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") - EXPECTED_OUTPUTS.add("Hello my name is jessie and i am a very sweet and") - EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") - EXPECTED_OUTPUTS.add("Hello my name is\nI am a student of the University of the'") - disable_exllama = True desc_act = True - def test_generate_quality(self): - # act_order don't work with qlinear_cuda kernel - 
pass - def test_serialization(self): # act_order don't work with qlinear_cuda kernel pass - @require_torch_gpu def test_exllama_serialization(self): """ Test the serialization of the model and the loading of the quantized weights with exllama kernel """ - from accelerate import init_empty_weights with tempfile.TemporaryDirectory() as tmpdirname: self.quantizer.save(self.quantized_model, tmpdirname) @@ -219,21 +198,23 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, exllama_config={"version": 1} + empty_model, + save_folder=tmpdirname, + device_map={"": self.device_for_inference}, + exllama_config={"version": 1}, ) self.check_quantized_layers_type(quantized_model_from_saved, "exllama") - with torch.device("cuda"): - _ = AutoModelForCausalLM.from_pretrained(tmpdirname) - _ = AutoGPTQForCausalLM.from_quantized(tmpdirname) - - self.check_inference_correctness(quantized_model_from_saved) + # transformers and auto-gptq compatibility + # quantized models are more compatible with device map than + # device context managers (they're never used in transformers testing suite) + _ = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": self.device_for_inference}) + _ = AutoGPTQForCausalLM.from_quantized(tmpdirname, device_map={"": self.device_for_inference}) def test_exllama_max_input_length(self): """ Test if the max_input_length works with exllama + act_order """ - from accelerate import init_empty_weights with tempfile.TemporaryDirectory() as tmpdirname: self.quantizer.save(self.quantized_model, tmpdirname) @@ -246,9 +227,9 @@ def test_exllama_max_input_length(self): quantized_model_from_saved = load_quantized_model( empty_model, save_folder=tmpdirname, - device_map={"": 0}, - max_input_length=4028, + device_map={"": self.device_for_inference}, exllama_config={"version": 1}, + max_input_length=4028, ) self.check_quantized_layers_type(quantized_model_from_saved, "exllama") @@ -268,26 +249,16 @@ def test_exllama_max_input_length(self): class GPTQTestExllamav2(GPTQTest): desc_act = False disable_exllama = True - EXPECTED_OUTPUTS = set() - EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") - EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") - EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of") - EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the") - - def test_generate_quality(self): - # don't need to test - pass + exllama_config = {"version": 2} def test_serialization(self): # don't need to test pass - @require_torch_gpu def test_exllama_serialization(self): """ Test the serialization of the model and the loading of the quantized weights with exllamav2 kernel """ - from accelerate import init_empty_weights with tempfile.TemporaryDirectory() as tmpdirname: self.quantizer.save(self.quantized_model, tmpdirname) @@ -300,24 +271,19 @@ def test_exllama_serialization(self): quantized_model_from_saved = load_quantized_model( empty_model, save_folder=tmpdirname, - device_map={"": 0}, + device_map={"": self.device_for_inference}, ) self.check_quantized_layers_type(quantized_model_from_saved, "exllamav2") - with torch.device("cuda"): - _ = AutoModelForCausalLM.from_pretrained(tmpdirname) - _ = AutoGPTQForCausalLM.from_quantized(tmpdirname) - - self.check_inference_correctness(quantized_model_from_saved) + # transformers and auto-gptq compatibility + # 
quantized models are more compatible with device map than + # device context managers (they're never used in transformers testing suite) + _ = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": self.device_for_inference}) + _ = AutoGPTQForCausalLM.from_quantized(tmpdirname, device_map={"": self.device_for_inference}) class GPTQTestNoBlockCaching(GPTQTest): cache_block_outputs = False - EXPECTED_OUTPUTS = set() - EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") - EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") - EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of") - EXPECTED_OUTPUTS.add("Hello my name is Aiden and I am a very good looking") class GPTQTestModuleQuant(GPTQTest): @@ -327,7 +293,7 @@ class GPTQTestModuleQuant(GPTQTest): ["mlp.dense_h_to_4h"], ["mlp.dense_4h_to_h"], ] - EXPECTED_RELATIVE_DIFFERENCE = 1.57705236164535 + expected_compression_ratio = 1.577 def test_not_converted_layers(self): # self_attention.dense should not be converted @@ -350,16 +316,11 @@ class GPTQUtilsTest(unittest.TestCase): ] def test_get_seqlen(self): - from optimum.gptq.utils import get_seqlen - model = AutoModelForCausalLM.from_pretrained(self.model_name) seqlen = get_seqlen(model) self.assertEqual(seqlen, self.expected_seqlen) def test_get_block_name(self): - from optimum.gptq.utils import get_block_name_with_pattern - from optimum.utils import recurse_getattr - model = AutoModelForCausalLM.from_pretrained(self.model_name) block_name = get_block_name_with_pattern(model) self.assertEqual(block_name, self.expected_block_name) @@ -367,8 +328,6 @@ def test_get_block_name(self): self.assertEqual(block_class_name, self.expected_block_name_class) def test_get_preceding_modules(self): - from optimum.gptq.utils import get_preceding_modules - model = AutoModelForCausalLM.from_pretrained(self.model_name) modules_names = get_preceding_modules(model, self.expected_block_name) self.assertCountEqual(modules_names, self.expected_preceding_modules) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index ef9168f0409..e66047fbdd3 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2737,6 +2737,7 @@ class ORTModelForImageClassificationIntegrationTest(ORTModelTestMixin): "resnet", "segformer", "swin", + "swin-window", "vit", ] diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 65298265780..bb6935461d7 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -143,6 +143,7 @@ "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", + "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", "t5": "hf-internal-testing/tiny-random-t5", "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel", "trocr": "microsoft/trocr-small-handwritten",
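
The fix to optimum/onnxruntime/subpackage/commands/optimize.py above makes the `-c/--config` option actually load the optimization section of a saved `ORTConfig` (the old code assigned the class itself and only reached the misspelled `from_pretained` call in the fallback branch), and now raises a `ValueError` when neither a config nor an -O level is given. Below is a minimal sketch of the equivalent programmatic flow, assuming a local directory that already contains an exported ONNX model; the paths are placeholders, not values from the diff.

from optimum.onnxruntime import AutoOptimizationConfig, ORTOptimizer
from optimum.onnxruntime.configuration import ORTConfig

onnx_model_dir = "path/to/onnx_model"  # assumed directory holding model.onnx

optimizer = ORTOptimizer.from_pretrained(onnx_model_dir)

# Either pick one of the predefined optimization levels...
optimization_config = AutoOptimizationConfig.O2()
# ...or reuse the optimization section of a saved ORTConfig, which is what the
# corrected -c/--config branch of the CLI now does:
# optimization_config = ORTConfig.from_pretrained("path/to/ort_config").optimization

optimizer.optimize(save_dir="optimized_model", optimization_config=optimization_config)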
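
The `evaluation_loop` helper added to optimum/onnxruntime/utils.py above iterates over an already-tokenized `datasets.Dataset` one sample at a time, keeps only the keys matching the ONNX model's input names, concatenates predictions and labels with `nested_concat`, and returns an `EvalLoopOutput`. A minimal usage sketch follows; the checkpoint, dataset, and column names are illustrative assumptions, not part of the diff.

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.onnxruntime.utils import evaluation_loop

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # assumed checkpoint
model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The dataset must already be tokenized: evaluation_loop only keeps the keys
# that match the ONNX model's input names and feeds them one sample at a time.
dataset = load_dataset("glue", "sst2", split="validation[:100]")
dataset = dataset.map(lambda example: tokenizer(example["sentence"], truncation=True))

def compute_accuracy(eval_pred):
    # eval_pred.predictions holds the concatenated logits, eval_pred.label_ids the labels
    predictions = np.asarray(eval_pred.predictions).argmax(-1)
    labels = np.asarray(eval_pred.label_ids)
    return {"accuracy": float((predictions == labels).mean())}

output = evaluation_loop(model, dataset, label_names=["label"], compute_metrics=compute_accuracy)
print(output.metrics, output.num_samples)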
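
The reworked GPTQ tests above drop the exact-match generation checks in favor of perplexity and compression-ratio assertions built on `evaluate_perplexity` from optimum.gptq.eval. The sketch below condenses that flow using the same checkpoint and quantization settings as the test class; it needs a CUDA GPU with auto-gptq installed, and the exact numbers depend on the checkpoint and calibration data.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.gptq import GPTQQuantizer
from optimum.gptq.eval import evaluate_perplexity

model_id = "bigscience/bloom-560m"  # checkpoint used by the tests
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda")

fp16_ppl = evaluate_perplexity(model, tokenizer)
fp16_mem = model.get_memory_footprint()

quantizer = GPTQQuantizer(
    bits=4,
    group_size=128,
    dataset=["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."],
)
quantized_model = quantizer.quantize_model(model, tokenizer)

quantized_ppl = evaluate_perplexity(quantized_model, tokenizer)
compression_ratio = fp16_mem / quantized_model.get_memory_footprint()

# The tests expect roughly fp16_ppl ~ 30, quantized_ppl ~ 34 and a ~1.66x
# memory compression ratio for this checkpoint.
print(int(fp16_ppl), int(quantized_ppl), round(compression_ratio, 2))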