diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 5a21f12d015..9aa8b307235 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -27,7 +27,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - pip install .[tests,onnxruntime,exporters-tf] + pip install .[tests,exporters] - name: Test with unittest working-directory: tests run: | diff --git a/README.md b/README.md index c892a142994..9a6403cdacb 100644 --- a/README.md +++ b/README.md @@ -79,8 +79,7 @@ It is possible to export 🤗 Transformers and Diffusers models to the OpenVINO optimum-cli export openvino --model distilbert-base-uncased-finetuned-sst-2-english distilbert_sst2_ov ``` -If you add `--weight-format int8`, the weights will be quantized to `int8`, check out our [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#weight-only-quantization) for more detail on weight only quantization. To apply quantization on both weights and activations, you can find more information [here](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#static-quantization). - +If you add `--weight-format int8`, the weights will be quantized to `int8`; check out our [documentation](https://huggingface.co/docs/optimum/main/intel/openvino/export) for more details. To apply quantization on both weights and activations, you can find more information [here](https://huggingface.co/docs/optimum/main/intel/openvino/optimization#static-quantization). To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. To load a PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, you can set `export=True` when loading your model. @@ -92,13 +91,13 @@ To load a model and run inference with OpenVINO Runtime, you can just replace yo model_id = "distilbert-base-uncased-finetuned-sst-2-english" tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForSequenceClassification.from_pretrained(model_id) -+ model = OVModelForSequenceClassification.from_pretrained("distilbert_sst2_ov") ++ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) classifier = pipeline("text-classification", model=model, tokenizer=tokenizer) results = classifier("He's a dreadful magician.") ``` -You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino). +You can find more examples in the [documentation](https://huggingface.co/docs/optimum/main/intel/openvino/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino). ### Neural Compressor diff --git a/examples/onnxruntime/optimization/multiple-choice/run_swag.py b/examples/onnxruntime/optimization/multiple-choice/run_swag.py index 3c43846b9a5..bcddc975907 100644 --- a/examples/onnxruntime/optimization/multiple-choice/run_swag.py +++ b/examples/onnxruntime/optimization/multiple-choice/run_swag.py @@ -37,7 +37,7 @@ from optimum.onnxruntime import ORTModelForMultipleChoice, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed.
The version of transformers must be >= 4.19.0 @@ -236,7 +236,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path) @@ -254,13 +253,18 @@ def main(): optimizer = ORTOptimizer.from_pretrained(model) # Optimize the model - optimizer.optimize( + optimized_model_path = optimizer.optimize( optimization_config=optimization_config, save_dir=training_args.output_dir, use_external_data_format=onnx_export_args.use_external_data_format, one_external_file=onnx_export_args.one_external_file, ) + model = ORTModelForMultipleChoice.from_pretrained( + optimized_model_path, + provider=optim_args.execution_provider, + ) + if training_args.do_eval: # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the # prediction step(s) @@ -339,13 +343,12 @@ def compute_metrics(eval_predictions): # Evaluation logger.info("*** Evaluate ***") - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["label"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: diff --git a/examples/onnxruntime/optimization/question-answering/run_qa.py b/examples/onnxruntime/optimization/question-answering/run_qa.py index 04a9bd34f36..407714cb01f 100644 --- a/examples/onnxruntime/optimization/question-answering/run_qa.py +++ b/examples/onnxruntime/optimization/question-answering/run_qa.py @@ -37,7 +37,7 @@ from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
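Note (not part of the patch): the optimization examples above all follow the same migration, replacing the deprecated `ORTModel.evaluation_loop` with the new free-standing `evaluation_loop` helper from `optimum.onnxruntime.utils` and reloading the optimized graph through the task-specific `ORTModelForXxx` class. A minimal sketch of that flow, shown with a sequence-classification checkpoint for brevity; the model id, dataset and accuracy metric are illustrative assumptions only.

```python
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, EvalPrediction

from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
from optimum.onnxruntime.utils import evaluation_loop

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)

# optimize() now returns the directory containing the optimized model,
# so it can be reloaded directly with the task-specific ORTModel class.
optimizer = ORTOptimizer.from_pretrained(model)
optimized_path = optimizer.optimize(
    optimization_config=OptimizationConfig(optimization_level=1),
    save_dir="distilbert_sst2_optimized",
)
model = ORTModelForSequenceClassification.from_pretrained(optimized_path, provider="CPUExecutionProvider")

# Tokenize an evaluation split and run the new evaluation helper.
eval_dataset = load_dataset("glue", "sst2", split="validation")
eval_dataset = eval_dataset.map(lambda ex: tokenizer(ex["sentence"], truncation=True), batched=True)

def compute_accuracy(p: EvalPrediction):
    return {"accuracy": float((np.argmax(p.predictions, axis=1) == p.label_ids).mean())}

outputs = evaluation_loop(
    model=model,
    dataset=eval_dataset,
    label_names=["label"],
    compute_metrics=compute_accuracy,
)
print(outputs.metrics)
```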
@@ -305,7 +305,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path) @@ -323,13 +322,15 @@ def main(): optimizer = ORTOptimizer.from_pretrained(model) # Optimize the model - optimizer.optimize( + optimized_model_path = optimizer.optimize( optimization_config=optimization_config, save_dir=training_args.output_dir, use_external_data_format=onnx_export_args.use_external_data_format, one_external_file=onnx_export_args.one_external_file, ) + model = ORTModelForQuestionAnswering.from_pretrained(optimized_model_path, provider=optim_args.execution_provider) + # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the # prediction step(s) if training_args.do_eval or training_args.do_predict: @@ -478,13 +479,12 @@ def compute_metrics(p: EvalPrediction): # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["start_positions", "end_positions"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) predictions = post_processing_function(eval_examples, eval_dataset, outputs.predictions) metrics = compute_metrics(predictions) @@ -514,12 +514,12 @@ def compute_metrics(p: EvalPrediction): # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, label_names=["start_positions", "end_positions"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = post_processing_function(predict_examples, predict_dataset, outputs.predictions) metrics = compute_metrics(predictions) diff --git a/examples/onnxruntime/optimization/text-classification/README.md b/examples/onnxruntime/optimization/text-classification/README.md index 42a99cc73d3..3a7dce2b59f 100644 --- a/examples/onnxruntime/optimization/text-classification/README.md +++ b/examples/onnxruntime/optimization/text-classification/README.md @@ -14,13 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. --> -# Text classification +# Text classification ## GLUE tasks -The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py) -allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as -the ones from the [GLUE benchmark](https://gluebenchmark.com/).
+The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py) allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as the ones from the [GLUE benchmark](https://gluebenchmark.com/). The following example applies graph optimization on a DistilBERT fine-tuned on the sst-2 task. Here the optimization level is selected to be 1, enabling basic optimizations such as redundant node eliminations and constant folding. Higher optimization level will result in hardware dependent optimized graph. diff --git a/examples/onnxruntime/optimization/text-classification/run_glue.py b/examples/onnxruntime/optimization/text-classification/run_glue.py index a07193915b8..222dda15074 100644 --- a/examples/onnxruntime/optimization/text-classification/run_glue.py +++ b/examples/onnxruntime/optimization/text-classification/run_glue.py @@ -42,7 +42,7 @@ from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -250,7 +250,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) @@ -268,13 +267,17 @@ def main(): optimizer = ORTOptimizer.from_pretrained(model) # Optimize the model - optimizer.optimize( + optimized_model_path = optimizer.optimize( optimization_config=optimization_config, save_dir=training_args.output_dir, use_external_data_format=onnx_export_args.use_external_data_format, one_external_file=onnx_export_args.one_external_file, ) + model = ORTModelForSequenceClassification.from_pretrained( + optimized_model_path, provider=optim_args.execution_provider + ) + # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the # prediction step(s) if training_args.do_eval or training_args.do_predict: @@ -408,13 +411,13 @@ def compute_metrics(p: EvalPrediction): desc="Running tokenizer on the evaluation dataset", ) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, label_names=["label"], ) - outputs = ort_model.evaluation_loop(eval_dataset) + # Save metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: json.dump(outputs.metrics, f, indent=4, sort_keys=True) @@ -436,10 +439,12 @@ def compute_metrics(p: EvalPrediction): desc="Running tokenizer on the test dataset", ) - ort_model = ORTModel( - optimized_model_path, execution_provider=optim_args.execution_provider, label_names=["label"] + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, + compute_metrics=compute_metrics, + label_names=["label"], ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.squeeze(outputs.predictions) if is_regression else np.argmax(outputs.predictions, axis=1) # Save predictions diff --git a/examples/onnxruntime/optimization/token-classification/run_ner.py b/examples/onnxruntime/optimization/token-classification/run_ner.py index
73db3671d2f..2e7b63792c3 100644 --- a/examples/onnxruntime/optimization/token-classification/run_ner.py +++ b/examples/onnxruntime/optimization/token-classification/run_ner.py @@ -38,7 +38,7 @@ from optimum.onnxruntime import ORTModelForTokenClassification, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -276,7 +276,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path) @@ -480,12 +479,11 @@ def compute_metrics(p): desc="Running tokenizer on the validation dataset", ) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: @@ -509,12 +507,11 @@ def compute_metrics(p): desc="Running tokenizer on the prediction dataset", ) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.argmax(outputs.predictions, axis=2) # Remove ignored index (special tokens) diff --git a/examples/onnxruntime/quantization/image-classification/run_image_classification.py b/examples/onnxruntime/quantization/image-classification/run_image_classification.py index 3d0fa72882e..6feaaef4f3b 100644 --- a/examples/onnxruntime/quantization/image-classification/run_image_classification.py +++ b/examples/onnxruntime/quantization/image-classification/run_image_classification.py @@ -22,7 +22,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -38,7 +37,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForImageClassification from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -47,6 +45,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop logger = logging.getLogger(__name__) @@ -378,13 +377,16 @@ def compute_metrics(p: EvalPrediction): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForImageClassification.from_pretrained( + quantized_model_path, provider=optim_args.execution_provider + ) # Evaluation if training_args.do_eval: @@ -409,13 +411,12 @@ def compute_metrics(p: EvalPrediction): # Set the validation 
transforms eval_dataset = eval_dataset.with_transform(preprocess_function) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=[labels_column], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: json.dump(outputs.metrics, f, indent=4, sort_keys=True) diff --git a/examples/onnxruntime/quantization/multiple-choice/run_swag.py b/examples/onnxruntime/quantization/multiple-choice/run_swag.py index 9d9642c12d5..9a8423f836d 100644 --- a/examples/onnxruntime/quantization/multiple-choice/run_swag.py +++ b/examples/onnxruntime/quantization/multiple-choice/run_swag.py @@ -38,7 +38,6 @@ from optimum.onnxruntime import ORTModelForMultipleChoice, ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( ExcludeGeLUNodes, @@ -46,6 +45,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. The version of transformers must be >= 4.19.0 @@ -409,13 +409,14 @@ def compute_metrics(eval_predictions): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForMultipleChoice.from_pretrained(quantized_model_path, provider=optim_args.execution_provider) # Evaluation if training_args.do_eval: @@ -436,13 +437,12 @@ def compute_metrics(eval_predictions): load_from_cache_file=not data_args.overwrite_cache, ) - ort_model = ORTModel( - os.path.join(training_args.output_dir, "model_quantized.onnx"), - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["label"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: diff --git a/examples/onnxruntime/quantization/question-answering/README.md b/examples/onnxruntime/quantization/question-answering/README.md index 380afff8cad..8345ca8e4d0 100644 --- a/examples/onnxruntime/quantization/question-answering/README.md +++ b/examples/onnxruntime/quantization/question-answering/README.md @@ -16,13 +16,9 @@ limitations under the License. # Question answering +The script [`run_qa.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/question-answering/run_qa.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for question answering tasks. 
-The script [`run_qa.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/question-answering/run_qa.py) -allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph -optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for question answering tasks. - -Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along -the flag `--version_2_with_negative`. +Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along the flag `--version_2_with_negative`. The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the SQuAD1.0 dataset. diff --git a/examples/onnxruntime/quantization/question-answering/run_qa.py b/examples/onnxruntime/quantization/question-answering/run_qa.py index 4a6a854fd97..50661b7b420 100644 --- a/examples/onnxruntime/quantization/question-answering/run_qa.py +++ b/examples/onnxruntime/quantization/question-answering/run_qa.py @@ -24,7 +24,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -39,7 +38,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForQuestionAnswering from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -48,6 +46,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
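Note (not part of the patch): for orientation, a rough programmatic sketch of the static post-training quantization flow that `run_qa.py` implements with the updated API, where `quantize()` now returns the save directory; it omits the node-exclusion `QuantizationPreprocessor` the script adds, and the checkpoint, calibration dataset and sample count are illustrative assumptions.

```python
from functools import partial

from transformers import AutoTokenizer

from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTQuantizer
from optimum.onnxruntime.configuration import AutoCalibrationConfig, AutoQuantizationConfig

model_id = "distilbert-base-uncased-distilled-squad"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForQuestionAnswering.from_pretrained(model_id, export=True)
quantizer = ORTQuantizer.from_pretrained(model)

# Static quantization needs calibration data to estimate activation ranges.
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=True, per_channel=False)

def preprocess_fn(examples, tokenizer):
    return tokenizer(
        examples["question"], examples["context"], truncation=True, padding="max_length", max_length=384
    )

calibration_dataset = quantizer.get_calibration_dataset(
    "squad",
    preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
    num_samples=64,
    dataset_split="train",
)
calibration_config = AutoCalibrationConfig.minmax(calibration_dataset)
ranges = quantizer.fit(
    dataset=calibration_dataset,
    calibration_config=calibration_config,
    operators_to_quantize=qconfig.operators_to_quantize,
)

# quantize() returns the directory holding the quantized model, which is then
# reloaded with the task-specific ORTModel class instead of the removed ORTModel wrapper.
quantized_path = quantizer.quantize(
    save_dir="distilbert_squad_quantized",
    calibration_tensors_range=ranges,
    quantization_config=qconfig,
)
model = ORTModelForQuestionAnswering.from_pretrained(quantized_path, provider="CPUExecutionProvider")
```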
@@ -651,25 +650,25 @@ def compute_metrics(p: EvalPrediction): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForQuestionAnswering.from_pretrained(quantized_model_path, provider=optim_args.execution_provider) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, label_names=["start_positions", "end_positions"], ) - outputs = ort_model.evaluation_loop(eval_dataset) predictions = post_processing_function(eval_examples, eval_dataset, outputs.predictions) metrics = compute_metrics(predictions) @@ -681,12 +680,11 @@ def compute_metrics(p: EvalPrediction): if training_args.do_predict: logger.info("*** Predict ***") - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, label_names=["start_positions", "end_positions"], ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = post_processing_function(predict_examples, predict_dataset, outputs.predictions) metrics = compute_metrics(predictions) diff --git a/examples/onnxruntime/quantization/text-classification/README.md b/examples/onnxruntime/quantization/text-classification/README.md index 460bb56fba8..95fd3335171 100644 --- a/examples/onnxruntime/quantization/text-classification/README.md +++ b/examples/onnxruntime/quantization/text-classification/README.md @@ -18,10 +18,7 @@ limitations under the License. ## GLUE tasks -The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/text-classification/run_glue.py) -allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph -optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as -the ones from the [GLUE benchmark](https://gluebenchmark.com/). +The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/text-classification/run_glue.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as the ones from the [GLUE benchmark](https://gluebenchmark.com/). The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the sst-2 task. 
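Note (not part of the patch): like the other quantization examples, `run_glue.py` now reloads the quantized model through `ORTModelForSequenceClassification` rather than the removed `ORTModel` wrapper. A short sketch of the dynamic-quantization variant described above; the checkpoint and save directory are illustrative assumptions.

```python
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # illustrative checkpoint
model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)
quantizer = ORTQuantizer.from_pretrained(model)

# Dynamic quantization needs no calibration data.
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
quantized_path = quantizer.quantize(save_dir="distilbert_sst2_quantized", quantization_config=dqconfig)

# The returned path is reloaded with the task-specific class for evaluation or inference.
model = ORTModelForSequenceClassification.from_pretrained(quantized_path, provider="CPUExecutionProvider")
```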
diff --git a/examples/onnxruntime/quantization/text-classification/run_glue.py b/examples/onnxruntime/quantization/text-classification/run_glue.py index bc141b2194f..4b9ee0403c3 100644 --- a/examples/onnxruntime/quantization/text-classification/run_glue.py +++ b/examples/onnxruntime/quantization/text-classification/run_glue.py @@ -23,7 +23,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -44,7 +43,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForSequenceClassification from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -53,6 +51,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -476,13 +475,16 @@ def compute_metrics(p: EvalPrediction): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForSequenceClassification.from_pretrained( + quantized_model_path, provider=optim_args.execution_provider + ) # Evaluation if training_args.do_eval: @@ -504,13 +506,13 @@ def compute_metrics(p: EvalPrediction): f" Evaluation results may suffer from a wrong matching." ) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, label_names=["label"], ) - outputs = ort_model.evaluation_loop(eval_dataset) + # Save metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: json.dump(outputs.metrics, f, indent=4, sort_keys=True) @@ -525,12 +527,11 @@ def compute_metrics(p: EvalPrediction): if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, label_names=["label"], ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.squeeze(outputs.predictions) if is_regression else np.argmax(outputs.predictions, axis=1) # Save predictions diff --git a/examples/onnxruntime/quantization/token-classification/README.md b/examples/onnxruntime/quantization/token-classification/README.md index f56388ed3c0..540b3cbe2dd 100644 --- a/examples/onnxruntime/quantization/token-classification/README.md +++ b/examples/onnxruntime/quantization/token-classification/README.md @@ -16,10 +16,7 @@ limitations under the License. 
# Token classification - -The script [`run_ner.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/token-classification/run_ner.py) -allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph -optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for token classification tasks. +The script [`run_ner.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/token-classification/run_ner.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for token classification tasks. The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the CoNLL-2003 task diff --git a/examples/onnxruntime/quantization/token-classification/run_ner.py b/examples/onnxruntime/quantization/token-classification/run_ner.py index 1cc12d3fbc0..3a5798c57a8 100644 --- a/examples/onnxruntime/quantization/token-classification/run_ner.py +++ b/examples/onnxruntime/quantization/token-classification/run_ner.py @@ -25,7 +25,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -40,7 +39,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForTokenClassification from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -49,6 +47,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
@@ -551,13 +550,16 @@ def compute_metrics(p): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForTokenClassification.from_pretrained( + quantized_model_path, provider=optim_args.execution_provider + ) # Evaluation if training_args.do_eval: @@ -572,12 +574,11 @@ def compute_metrics(p): desc="Running tokenizer on the validation dataset", ) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: @@ -602,12 +603,11 @@ def compute_metrics(p): desc="Running tokenizer on the prediction dataset", ) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.argmax(outputs.predictions, axis=2) # Remove ignored index (special tokens) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index e23716d4b74..c66e54b323c 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -72,6 +72,7 @@ from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME from .model_patcher import ( FalconModelPatcher, + MistralModelPatcher, MusicgenModelPatcher, SAMModelPatcher, SentenceTransformersCLIPPatcher, @@ -237,7 +238,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class GPT2OnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_layers="n_layer", num_attention_heads="n_head") @@ -259,7 +260,7 @@ class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -312,6 +313,11 @@ class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MistralModelPatcher(self, model, model_kwargs=model_kwargs) + class MPTOnnxConfig(TextDecoderOnnxConfig): # MPT does not require position_ids input. 
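Note (not part of the patch): these config changes are picked up automatically by the ONNX export entry points. GPT-2 and GPT-NeoX now default to opset 14, which the SDPA-based attention in recent Transformers requires, and Mistral exports go through the new `MistralModelPatcher`. A minimal, illustrative export call; the checkpoint and output directory are placeholders.

```python
from optimum.exporters.onnx import main_export

# Exports the decoder with KV-cache support; with this change the opset defaults to 14 for GPT-2.
main_export(
    "gpt2",  # any supported decoder checkpoint; a Mistral checkpoint would use the new patcher
    output="gpt2_onnx",
    task="text-generation-with-past",
)
```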
@@ -600,7 +606,7 @@ def inputs_for_default_and_seq2seq_lm(self): def inputs_for_causal_lm(self): if self.use_past_in_inputs: common_inputs = { - "input_ids": {0: "batch_size"}, + "input_ids": {0: "batch_size", 1: "sequence_length"}, "attention_mask": {0: "batch_size", 1: "past_sequence_length + 1"}, } for i in range(self._normalized_config.decoder_num_layers): @@ -645,7 +651,11 @@ def outputs(self) -> Dict[str, Dict[int, str]]: common_outputs = super(OnnxConfigWithPast, self).outputs if self.use_past: # When exporting decoder models with use_cache=True, both the decoder without past and with past have the KV cache as an output. - for i in range(self._normalized_config.encoder_num_layers): + for i in range( + self._normalized_config.encoder_num_layers + if self.task != "text-generation" + else self._normalized_config.decoder_num_layers + ): common_outputs[f"present.{i}.key"] = {0: "batch_size", 2: "past_sequence_length + sequence_length"} common_outputs[f"present.{i}.value"] = { 0: "batch_size", diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 0a105343546..5e720d0cd7d 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -42,6 +42,9 @@ _prepare_4d_causal_attention_mask_for_sdpa = None AttentionMaskConverter = None +if _transformers_version >= version.parse("4.42"): + from transformers.cache_utils import SlidingWindowCache, StaticCache + if TYPE_CHECKING: from transformers import PreTrainedModel, TFPreTrainedModel @@ -746,6 +749,20 @@ def patched_forward( class SentenceTransformersTransformerPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + if _transformers_version >= version.parse("4.42") and self.real_config._config.model_type == "mistral": + self._model[0].auto_model._update_causal_mask = types.MethodType( + _update_causal_mask_patched, self._model[0].auto_model + ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if _transformers_version >= version.parse("4.42") and self.real_config._config.model_type == "mistral": + self._model[0].auto_model._update_causal_mask = types.MethodType( + self._update_causal_mask_original, self._model[0].auto_model + ) + def __init__( self, config: "OnnxConfig", @@ -754,6 +771,9 @@ def __init__( ): super().__init__(config, model, model_kwargs) + if _transformers_version >= version.parse("4.42") and self.real_config._config.model_type == "mistral": + self._update_causal_mask_original = self._model[0].auto_model._update_causal_mask + def patched_forward(input_ids, attention_mask): result = self.orig_forward({"input_ids": input_ids, "attention_mask": attention_mask}) @@ -931,3 +951,182 @@ def patched_forward( return {"audio_values": audio_values} self.patched_forward = patched_forward + + +def _update_causal_mask_patched( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values, + use_cache: bool, + output_attentions: bool, +): + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. 
See more context in https://github.com/huggingface/transformers/pull/29114 + + if self._attn_implementation == "flash_attention_2": + if attention_mask is not None and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0] + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + + # cache_position must be valid here no matter which cache we use + past_seen_tokens = cache_position[0] if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) + + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + # SlidingWindowCache + if using_sliding_window_cache: + target_length = max(sequence_length, self.config.sliding_window) + # StaticCache + elif using_static_cache: + target_length = past_key_values.get_max_length() + # DynamicCache or no cache + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + exclude_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if self.config.sliding_window is not None: + if not using_sliding_window_cache or sequence_length > self.config.sliding_window: + # ---------------- NOTE: This part is patched ----------------------------- + exclude_mask = torch.bitwise_or( + exclude_mask, + torch.arange(target_length, device=device) + <= (cache_position.reshape(-1, 1) - self.config.sliding_window), + ) + # ---------------- NOTE: patch end ---------------------------------------- + + causal_mask *= exclude_mask + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + 
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + # if ( + # self.config._attn_implementation == "sdpa" + # and attention_mask is not None + # and attention_mask.device.type == "cuda" + # and not output_attentions + # ): + # # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # # Details: https://github.com/pytorch/pytorch/issues/110213 + # causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +class MistralModelPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + if AttentionMaskConverter is not None: + # TODO: Remove this _make_causal_mask patch once transformers is well above 4.35 + AttentionMaskConverter._make_causal_mask = _make_causal_mask_patched_staticmethod + + if _transformers_version >= version.parse("4.36"): + AttentionMaskConverter._unmask_unattended = _unmask_unattended_patched_staticmethod + + if _transformers_version >= version.parse("4.36"): + patch_everywhere( + "_prepare_4d_causal_attention_mask_for_sdpa", _prepare_4d_causal_attention_mask_for_sdpa_patched + ) + + if _transformers_version >= version.parse("4.42"): + if hasattr(self._model, "model"): + self._model.model._update_causal_mask = types.MethodType( + _update_causal_mask_patched, self._model.model + ) + else: + self._model._update_causal_mask = types.MethodType(_update_causal_mask_patched, self._model) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if AttentionMaskConverter is not None: + # TODO: Remove this _make_causal_mask patch once transformers is well above 4.35 + AttentionMaskConverter._make_causal_mask = staticmethod(self.original_make_causal) + + if _transformers_version >= version.parse("4.36"): + AttentionMaskConverter._unmask_unattended = staticmethod(self.original_unmask_unattended) + + if _transformers_version >= version.parse("4.36"): + patch_everywhere( + "_prepare_4d_causal_attention_mask_for_sdpa", self.original_prepare_4d_causal_attention_mask_for_sdpa + ) + + if _transformers_version >= version.parse("4.42"): + if hasattr(self._model, "model"): + self._model.model._update_causal_mask = types.MethodType( + self._update_causal_mask_original, self._model.model + ) + else: + self._model._update_causal_mask = types.MethodType(self._update_causal_mask_original, self._model) + + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + if _transformers_version >= version.parse("4.36"): + self.original_prepare_4d_causal_attention_mask_for_sdpa = _prepare_4d_causal_attention_mask_for_sdpa + self.original_unmask_unattended = AttentionMaskConverter._unmask_unattended + + # TODO: Remove this once transformers is well above 4.35 + if AttentionMaskConverter is not None: + self.original_make_causal = AttentionMaskConverter._make_causal_mask + + if _transformers_version >= version.parse("4.42"): + if hasattr(self._model, "model"): + self._update_causal_mask_original = self._model.model._update_causal_mask + else: + self._update_causal_mask_original = self._model._update_causal_mask diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 74b05d5b151..6f3f641b439 100644 ---
a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -415,13 +415,6 @@ def from_pretrained( trust_remote_code=trust_remote_code, ) - if not export and trust_remote_code: - logger.warning( - "The argument `trust_remote_code` is to be used along with export=True. It will be ignored." - ) - elif export and trust_remote_code is None: - trust_remote_code = False - from_pretrained_method = cls._from_transformers if export else cls._from_pretrained return from_pretrained_method( diff --git a/optimum/onnxruntime/model.py b/optimum/onnxruntime/model.py index 23ca6e5e6a6..caa662f3824 100644 --- a/optimum/onnxruntime/model.py +++ b/optimum/onnxruntime/model.py @@ -49,6 +49,11 @@ def __init__( label_names (`List[str]`, `optional`): The list of keys in your dictionary of inputs that correspond to the labels. """ + + logger.warning( + "The class `optimum.onnxruntime.model.ORTModel` is deprecated and will be removed in the next release." + ) + self.compute_metrics = compute_metrics self.label_names = ["labels"] if label_names is None else label_names self.session = InferenceSession(str(model_path), providers=[execution_provider]) diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index fd7e741d7c0..6a0dcbba2f0 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -121,6 +121,7 @@ class ORTModelForCausalLM(ORTModel, GenerationMixin): auto_model_class = AutoModelForCausalLM main_input_name = "input_ids" + _supports_cache_class = False def __init__( self, diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index b3bad65954d..126b1e65366 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -1091,10 +1091,11 @@ def forward( onnx_outputs = self.model.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # TODO: why do we only return last_hidden_state? why not all outputs? - # that way, there will be less need for ORTModelForCustomTask in cases where - # we just want to extend model outputs with attentions, hidden_states, etc. - last_hidden_state = model_outputs["last_hidden_state"] + if "last_hidden_state" in self.output_names: + last_hidden_state = model_outputs["last_hidden_state"] + else: + # TODO: This allows to support sentence-transformers models (sentence embedding), but is not validated. 
+ last_hidden_state = next(iter(model_outputs.values())) # converts output to namedtuple for pipelines post-processing return BaseModelOutput(last_hidden_state=last_hidden_state) diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 89a0ae44d58..3b1af05d0f5 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -570,6 +570,7 @@ class ORTModelForConditionalGeneration(ORTModel, ABC): # Used in from_transformers to export model to onnxORTEncoder base_model_prefix = "onnx_model" + _supports_cache_class = False def __init__( self, diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 37d0feefcc4..ad40af92b9d 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -17,11 +17,15 @@ import re from enum import Enum from inspect import signature -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch from packaging import version +from tqdm import tqdm +from transformers import EvalPrediction +from transformers.trainer_pt_utils import nested_concat +from transformers.trainer_utils import EvalLoopOutput from transformers.utils import logging import onnxruntime as ort @@ -30,6 +34,12 @@ from ..utils.import_utils import _is_package_available +if TYPE_CHECKING: + from datasets import Dataset + + from .modeling_ort import ORTModel + + logger = logging.get_logger(__name__) ONNX_WEIGHTS_NAME = "model.onnx" @@ -341,3 +351,55 @@ class ORTQuantizableOperator(Enum): Resize = "Resize" AveragePool = "AveragePool" Concat = "Concat" + + +def evaluation_loop( + model: "ORTModel", + dataset: "Dataset", + label_names: Optional[List[str]] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, +): + """ + Runs evaluation and returns metrics and predictions. + + Args: + model (`ORTModel`): + The ONNXRuntime model to use for the evaluation step. + dataset (`datasets.Dataset`): + Dataset to use for the evaluation step. + label_names (`List[str]`, `optional`): + The list of keys in your dictionary of inputs that correspond to the labels. + compute_metrics (`Callable[[EvalPrediction], Dict]`, `optional`): + The function that will be used to compute metrics at evaluation. Must take an `EvalPrediction` and + return a dictionary mapping metric names to metric values.
+ """ + + all_preds = None + all_labels = None + + for inputs in tqdm(dataset, desc="Evaluation"): + has_labels = all(inputs.get(k) is not None for k in label_names) + if has_labels: + labels = tuple(np.array([inputs.get(name)]) for name in label_names) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + inputs = {key: np.array([inputs[key]]) for key in model.input_names if key in inputs} + preds = model(**inputs) + + if len(preds) == 1: + preds = preds[0] + + all_preds = preds if all_preds is None else nested_concat(all_preds, preds, padding_index=-100) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + if compute_metrics is not None and all_preds is not None and all_labels is not None: + metrics = compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + else: + metrics = {} + + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=len(dataset)) diff --git a/optimum/version.py b/optimum/version.py index 6deb421ee56..8eeeb9d05a7 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.21.0.dev0" +__version__ = "1.22.0.dev0" diff --git a/setup.py b/setup.py index b6a5b07f932..41598aeba5f 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.26.0,<4.42.0", + "transformers[sentencepiece]>=4.26.0,<4.43.0", "torch>=1.11", "packaging", "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 @@ -75,11 +75,12 @@ "transformers[sentencepiece]>=4.26.0,<4.38.0", ], "diffusers": ["diffusers"], - "intel": "optimum-intel>=1.16.0", - "openvino": "optimum-intel[openvino]>=1.16.0", - "nncf": "optimum-intel[nncf]>=1.16.0", - "neural-compressor": "optimum-intel[neural-compressor]>=1.16.0", - "habana": ["optimum-habana", "transformers >= 4.38.0, < 4.39.0"], + "intel": "optimum-intel>=1.18.0", + "openvino": "optimum-intel[openvino]>=1.18.0", + "nncf": "optimum-intel[nncf]>=1.18.0", + "neural-compressor": "optimum-intel[neural-compressor]>=1.18.0", + "ipex": "optimum-intel[ipex]>=1.18.0", + "habana": ["optimum-habana", "transformers >= 4.40.0, < 4.41.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], "graphcore": "optimum-graphcore",