From d82d4c656ed80da6684cd4d3766edfda8e7a1705 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 1 Jul 2024 13:06:29 +0400 Subject: [PATCH 01/12] Fix incorrect names for usage blenderbot for causallm (#1887) * Fix incorrect names for usage blenderbot for causallm * fix input dynamic shapes as dummy input seq len != 1 * apply code style --- optimum/exporters/onnx/model_configs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index e23716d4b74..421b7c9010a 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -600,7 +600,7 @@ def inputs_for_default_and_seq2seq_lm(self): def inputs_for_causal_lm(self): if self.use_past_in_inputs: common_inputs = { - "input_ids": {0: "batch_size"}, + "input_ids": {0: "batch_size", 1: "sequence_length"}, "attention_mask": {0: "batch_size", 1: "past_sequence_length + 1"}, } for i in range(self._normalized_config.decoder_num_layers): @@ -645,7 +645,11 @@ def outputs(self) -> Dict[str, Dict[int, str]]: common_outputs = super(OnnxConfigWithPast, self).outputs if self.use_past: # When exporting decoder models with use_cache=True, both the decoder without past and with past have the KV cache as an output. - for i in range(self._normalized_config.encoder_num_layers): + for i in range( + self._normalized_config.encoder_num_layers + if self.task != "text-generation" + else self._normalized_config.decoder_num_layers + ): common_outputs[f"present.{i}.key"] = {0: "batch_size", 2: "past_sequence_length + sequence_length"} common_outputs[f"present.{i}.value"] = { 0: "batch_size", From a5500c7e5047ec43e73925a01a1e98b72e64b0d3 Mon Sep 17 00:00:00 2001 From: Satish Silveri <35198773+satishsilveri@users.noreply.github.com> Date: Mon, 1 Jul 2024 08:22:07 -0400 Subject: [PATCH 02/12] Fixed bug key error "last_hidden_state" (#1674) * bug fix: key error last_hidden_state * fix --------- Co-authored-by: Satish Silveri Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> --- optimum/onnxruntime/modeling_ort.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index b3bad65954d..bfdfbff1b11 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -1091,10 +1091,10 @@ def forward( onnx_outputs = self.model.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # TODO: why do we only return last_hidden_state? why not all outputs? - # that way, there will be less need for ORTModelForCustomTask in cases where - # we just want to extend model outputs with attentions, hidden_states, etc. 
- last_hidden_state = model_outputs["last_hidden_state"] + if "last_hidden_state" in self.output_names: + last_hidden_state = model_outputs[self.output_names["last_hidden_state"]] + else: + last_hidden_state = model_outputs[0] # converts output to namedtuple for pipelines post-processing return BaseModelOutput(last_hidden_state=last_hidden_state) From 86adc3e50a2bed04c8ecf86e1eba170b451e4afd Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Mon, 1 Jul 2024 18:03:42 +0200 Subject: [PATCH 03/12] Support transformers 4.42 (#1929) * support transformers 4.42 * fix mistral * update opsets * fix _supports_cache_class * typo * nit * remove onnxruntime extra in ci * fix --- .github/workflows/test_onnx.yml | 2 +- optimum/exporters/onnx/model_configs.py | 10 +- optimum/exporters/onnx/model_patcher.py | 197 ++++++++++++++++++++++++ optimum/onnxruntime/modeling_decoder.py | 1 + optimum/onnxruntime/modeling_ort.py | 5 +- optimum/onnxruntime/modeling_seq2seq.py | 1 + setup.py | 2 +- 7 files changed, 212 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 5a21f12d015..9aa8b307235 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -27,7 +27,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - pip install .[tests,onnxruntime,exporters-tf] + pip install .[tests,exporters] - name: Test with unittest working-directory: tests run: | diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 421b7c9010a..c66e54b323c 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -72,6 +72,7 @@ from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME from .model_patcher import ( FalconModelPatcher, + MistralModelPatcher, MusicgenModelPatcher, SAMModelPatcher, SentenceTransformersCLIPPatcher, @@ -237,7 +238,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class GPT2OnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_layers="n_layer", num_attention_heads="n_head") @@ -259,7 +260,7 @@ class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -312,6 +313,11 @@ class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MistralModelPatcher(self, model, model_kwargs=model_kwargs) + class MPTOnnxConfig(TextDecoderOnnxConfig): # MPT does not require position_ids input. 
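For orientation, the `patch_model_for_export` hook added above is applied as a context manager around the ONNX trace: the returned `MistralModelPatcher` installs the patched causal-mask logic on `__enter__` and restores the original method on `__exit__` (both defined in the next file's diff). A rough usage sketch follows; the placeholder checkpoint and the exact `MistralOnnxConfig` constructor arguments are assumptions, not something this patch defines.

from transformers import AutoConfig, AutoModelForCausalLM
from optimum.exporters.onnx.model_configs import MistralOnnxConfig

model_id = "hf-internal-testing/tiny-random-MistralForCausalLM"  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained(model_id)
onnx_config = MistralOnnxConfig(AutoConfig.from_pretrained(model_id), task="text-generation")

with onnx_config.patch_model_for_export(model):
    # torch.onnx.export(...) would run here with _update_causal_mask patched;
    # on exit the patcher restores the original implementation.
    pass
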
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 0a105343546..215d65549f8 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -42,6 +42,9 @@ _prepare_4d_causal_attention_mask_for_sdpa = None AttentionMaskConverter = None +if _transformers_version >= version.parse("4.42"): + from transformers.cache_utils import SlidingWindowCache, StaticCache + if TYPE_CHECKING: from transformers import PreTrainedModel, TFPreTrainedModel @@ -746,6 +749,20 @@ def patched_forward( class SentenceTransformersTransformerPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + if _transformers_version >= version.parse("4.42") and self.real_config._config.model_type == "mistral": + self._model[0].auto_model._update_causal_mask = types.MethodType( + _update_causal_mask_patched, self._model[0].auto_model + ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if _transformers_version >= version.parse("4.42") and self.real_config._config.model_type == "mistral": + self._model[0].auto_model._update_causal_mask = types.MethodType( + self._update_causal_mask_original, self._model[0].auto_model + ) + def __init__( self, config: "OnnxConfig", @@ -754,6 +771,8 @@ def __init__( ): super().__init__(config, model, model_kwargs) + self._update_causal_mask_original = self._model[0].auto_model._update_causal_mask + def patched_forward(input_ids, attention_mask): result = self.orig_forward({"input_ids": input_ids, "attention_mask": attention_mask}) @@ -931,3 +950,181 @@ def patched_forward( return {"audio_values": audio_values} self.patched_forward = patched_forward + + +def _update_causal_mask_patched( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values, + use_cache: bool, + output_attentions: bool, +): + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self._attn_implementation == "flash_attention_2": + if attention_mask is not None and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0] + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
+ + # cache_position must be valid here no matter which cache we use + past_seen_tokens = cache_position[0] if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) + + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + # SlidingWindowCache + if using_sliding_window_cache: + target_length = max(sequence_length, self.config.sliding_window) + # StaticCache + elif using_static_cache: + target_length = past_key_values.get_max_length() + # DynamicCache or no cache + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + exclude_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if self.config.sliding_window is not None: + if not using_sliding_window_cache or sequence_length > self.config.sliding_window: + # ---------------- NOTE: This part is patched ----------------------------- + exclude_mask.bitwise_or_( + torch.arange(target_length, device=device) + <= (cache_position.reshape(-1, 1) - self.config.sliding_window) + ) + # ---------------- NOTE: patch end ---------------------------------------- + + causal_mask *= exclude_mask + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + # if ( + # self.config._attn_implementation == "sdpa" + # and attention_mask is not None + # and attention_mask.device.type == "cuda" + # and not output_attentions + # ): + # # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # # Details: https://github.com/pytorch/pytorch/issues/110213 + # causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +class MistralModelPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + if AttentionMaskConverter is not None: + # TODO: Remove this _make_causal_mask patch if once transformers if much above 4.35 + AttentionMaskConverter._make_causal_mask = _make_causal_mask_patched_staticmethod + + if _transformers_version >= version.parse("4.36"): + AttentionMaskConverter._unmask_unattended = _unmask_unattended_patched_staticmethod + + if _transformers_version >= version.parse("4.36"): + patch_everywhere( + "_prepare_4d_causal_attention_mask_for_sdpa", _prepare_4d_causal_attention_mask_for_sdpa_patched + ) + + if _transformers_version >= version.parse("4.42"): + if hasattr(self._model, "model"): + self._model.model._update_causal_mask = types.MethodType( + _update_causal_mask_patched, self._model.model + ) + else: + self._model._update_causal_mask = types.MethodType(_update_causal_mask_patched, self._model) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if AttentionMaskConverter is not None: + # TODO: Remove this _make_causal_mask patch if once transformers if much above 4.35 + AttentionMaskConverter._make_causal_mask = staticmethod(self.original_make_causal) + + if _transformers_version >= version.parse("4.36"): + AttentionMaskConverter._unmask_unattended = staticmethod(self.original_unmask_unattended) + + if _transformers_version >= version.parse("4.36"): + patch_everywhere( + "_prepare_4d_causal_attention_mask_for_sdpa", self.original_prepare_4d_causal_attention_mask_for_sdpa + ) + + if _transformers_version >= version.parse("4.42"): + if hasattr(self._model, "model"): + self._model.model._update_causal_mask = types.MethodType( + self._update_causal_mask_original, self._model.model + ) + else: + self._model._update_causal_mask = types.MethodType(self._update_causal_mask_original, self._model) + + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + if _transformers_version >= version.parse("4.36"): + self.original_prepare_4d_causal_attention_mask_for_sdpa = _prepare_4d_causal_attention_mask_for_sdpa + self.original_unmask_unattended = AttentionMaskConverter._unmask_unattended + + # TODO: Remove this if once transformers if much above 4.35 + if AttentionMaskConverter is not None: + self.original_make_causal = AttentionMaskConverter._make_causal_mask + + if _transformers_version >= version.parse("4.42"): + if hasattr(self._model, "model"): + self._update_causal_mask_original = self._model.model._update_causal_mask + else: + self._update_causal_mask_original = self._model._update_causal_mask diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index fd7e741d7c0..6a0dcbba2f0 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -121,6 +121,7 @@ class ORTModelForCausalLM(ORTModel, GenerationMixin): auto_model_class = AutoModelForCausalLM main_input_name = "input_ids" + _supports_cache_class = False def __init__( self, diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index bfdfbff1b11..14bcad682c7 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ 
b/optimum/onnxruntime/modeling_ort.py @@ -1092,9 +1092,10 @@ def forward( model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) if "last_hidden_state" in self.output_names: - last_hidden_state = model_outputs[self.output_names["last_hidden_state"]] + last_hidden_state = model_outputs["last_hidden_state"] else: - last_hidden_state = model_outputs[0] + # TODO: This allows to support sentence-transformers models (sentence embedding), but is not validated. + last_hidden_state = next(iter(model_outputs.values())) # converts output to namedtuple for pipelines post-processing return BaseModelOutput(last_hidden_state=last_hidden_state) diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 89a0ae44d58..3b1af05d0f5 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -570,6 +570,7 @@ class ORTModelForConditionalGeneration(ORTModel, ABC): # Used in from_transformers to export model to onnxORTEncoder base_model_prefix = "onnx_model" + _supports_cache_class = False def __init__( self, diff --git a/setup.py b/setup.py index b6a5b07f932..cc88760d614 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.26.0,<4.42.0", + "transformers[sentencepiece]>=4.26.0,<4.43.0", "torch>=1.11", "packaging", "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 From 16d4d7298ba721438e2bed58a6a8e586eb50519c Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Tue, 2 Jul 2024 10:07:25 +0200 Subject: [PATCH 04/12] Update dev version (#1934) update version --- optimum/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/version.py b/optimum/version.py index 6deb421ee56..8eeeb9d05a7 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.21.0.dev0" +__version__ = "1.22.0.dev0" From ae591be7632b1148430b884aaeb49e78ce561b8d Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:02:03 +0200 Subject: [PATCH 05/12] Fix sentence transformers model patching (#1936) fix sentence transformers modeling patching for export --- optimum/exporters/onnx/model_patcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 215d65549f8..f3a3ad78dbf 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -771,7 +771,8 @@ def __init__( ): super().__init__(config, model, model_kwargs) - self._update_causal_mask_original = self._model[0].auto_model._update_causal_mask + if _transformers_version >= version.parse("4.42") and self.real_config._config.model_type == "mistral": + self._update_causal_mask_original = self._model[0].auto_model._update_causal_mask def patched_forward(input_ids, attention_mask): result = self.orig_forward({"input_ids": input_ids, "attention_mask": attention_mask}) From 4e01a4a948cf48a9152f86349e82ea6cc72a0d03 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:06:45 +0200 Subject: [PATCH 06/12] Update optimum intel extra (#1935) * update optimum intel extra * add ipex extra --- setup.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index cc88760d614..519d04eab8e 100644 --- a/setup.py +++ b/setup.py @@ -75,10 +75,11 @@ "transformers[sentencepiece]>=4.26.0,<4.38.0", ], "diffusers": ["diffusers"], - "intel": "optimum-intel>=1.16.0", - "openvino": "optimum-intel[openvino]>=1.16.0", - "nncf": "optimum-intel[nncf]>=1.16.0", - "neural-compressor": "optimum-intel[neural-compressor]>=1.16.0", + "intel": "optimum-intel>=1.18.0", + "openvino": "optimum-intel[openvino]>=1.18.0", + "nncf": "optimum-intel[nncf]>=1.18.0", + "neural-compressor": "optimum-intel[neural-compressor]>=1.18.0", + "ipex": "optimum-intel[ipex]>=1.18.0", "habana": ["optimum-habana", "transformers >= 4.38.0, < 4.39.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], From f7912d64ec23a986355e9bcdf23a947e8a91acd8 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 2 Jul 2024 15:48:06 +0200 Subject: [PATCH 07/12] Update Habana extra (#1937) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 519d04eab8e..41598aeba5f 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,7 @@ "nncf": "optimum-intel[nncf]>=1.18.0", "neural-compressor": "optimum-intel[neural-compressor]>=1.18.0", "ipex": "optimum-intel[ipex]>=1.18.0", - "habana": ["optimum-habana", "transformers >= 4.38.0, < 4.39.0"], + "habana": ["optimum-habana", "transformers >= 4.40.0, < 4.41.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], "graphcore": "optimum-graphcore", From f755a58e56597f690be4a0c4bdb549ce0ffd4e03 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 2 Jul 2024 18:50:36 +0200 Subject: [PATCH 08/12] Remove inplace op in mistral patcher (#1938) remove bitwise inplace op --- 
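The hunk below swaps an in-place `bitwise_or_` for the functional `torch.bitwise_or`; the in-place mutation inside the patched `_update_causal_mask` is presumably what trips up the ONNX trace. A tiny stand-alone illustration of the two forms, on toy tensors rather than the actual masks:

import torch

a = torch.zeros(4, 4, dtype=torch.bool)
b = torch.eye(4, dtype=torch.bool)

# In-place form removed by this patch: mutates `a` directly.
a.bitwise_or_(b)

# Functional form introduced by this patch: returns a new tensor and rebinds the name.
a = torch.bitwise_or(a, b)
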
optimum/exporters/onnx/model_patcher.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index f3a3ad78dbf..5e720d0cd7d 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -1031,9 +1031,10 @@ def _update_causal_mask_patched( if self.config.sliding_window is not None: if not using_sliding_window_cache or sequence_length > self.config.sliding_window: # ---------------- NOTE: This part is patched ----------------------------- - exclude_mask.bitwise_or_( + exclude_mask = torch.bitwise_or( + exclude_mask, torch.arange(target_length, device=device) - <= (cache_position.reshape(-1, 1) - self.config.sliding_window) + <= (cache_position.reshape(-1, 1) - self.config.sliding_window), ) # ---------------- NOTE: patch end ---------------------------------------- From 5c803db8cef21b22d0bdbf8a69653b74656e193e Mon Sep 17 00:00:00 2001 From: Yue Fei Date: Fri, 5 Jul 2024 00:53:45 +0800 Subject: [PATCH 09/12] Fix forward bug in ORTModelForFeatureExtraction (#1941) Fix forward bug for ORTModelForFeatureExtraction --- optimum/onnxruntime/modeling_ort.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 14bcad682c7..126b1e65366 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -1091,11 +1091,11 @@ def forward( onnx_outputs = self.model.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - if "last_hidden_state" in self.output_names: - last_hidden_state = model_outputs["last_hidden_state"] - else: - # TODO: This allows to support sentence-transformers models (sentence embedding), but is not validated. - last_hidden_state = next(iter(model_outputs.values())) + if "last_hidden_state" in self.output_names: + last_hidden_state = model_outputs["last_hidden_state"] + else: + # TODO: This allows to support sentence-transformers models (sentence embedding), but is not validated. + last_hidden_state = next(iter(model_outputs.values())) # converts output to namedtuple for pipelines post-processing return BaseModelOutput(last_hidden_state=last_hidden_state) From 6e2e56412901b2d716fdbfc536a3e9d6c31ee9d8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 5 Jul 2024 13:57:14 +0200 Subject: [PATCH 10/12] Update README code snippet (#1944) --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c892a142994..9a6403cdacb 100644 --- a/README.md +++ b/README.md @@ -79,8 +79,7 @@ It is possible to export 🤗 Transformers and Diffusers models to the OpenVINO optimum-cli export openvino --model distilbert-base-uncased-finetuned-sst-2-english distilbert_sst2_ov ``` -If you add `--weight-format int8`, the weights will be quantized to `int8`, check out our [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#weight-only-quantization) for more detail on weight only quantization. To apply quantization on both weights and activations, you can find more information [here](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#static-quantization). - +If you add `--weight-format int8`, the weights will be quantized to `int8`, check out our [documentation](https://huggingface.co/docs/optimum/main/intel/openvino/export) for more detail. 
To apply quantization on both weights and activations, you can find more information [here](https://huggingface.co/docs/optimum/main/intel/openvino/optimization#static-quantization). To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. To load a PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, you can set `export=True` when loading your model. @@ -92,13 +91,13 @@ To load a model and run inference with OpenVINO Runtime, you can just replace yo model_id = "distilbert-base-uncased-finetuned-sst-2-english" tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForSequenceClassification.from_pretrained(model_id) -+ model = OVModelForSequenceClassification.from_pretrained("distilbert_sst2_ov") ++ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) classifier = pipeline("text-classification", model=model, tokenizer=tokenizer) results = classifier("He's a dreadful magician.") ``` -You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino). +You can find more examples in the [documentation](https://huggingface.co/docs/optimum/main/intel/openvino/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino). ### Neural Compressor From 4aa0c143ed0c8134f8a37690f2191365876f39b2 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 8 Jul 2024 14:04:28 +0200 Subject: [PATCH 11/12] Deprecate ORTModel class (#1939) * deprecate and create alternative * update optimization examples * update quant examples * fix --- .../optimization/multiple-choice/run_swag.py | 19 +++--- .../optimization/question-answering/run_qa.py | 24 +++---- .../text-classification/README.md | 6 +- .../text-classification/run_glue.py | 25 +++++--- .../token-classification/run_ner.py | 17 +++-- .../run_image_classification.py | 17 ++--- .../quantization/multiple-choice/run_swag.py | 14 ++--- .../quantization/question-answering/README.md | 8 +-- .../quantization/question-answering/run_qa.py | 20 +++--- .../text-classification/README.md | 5 +- .../text-classification/run_glue.py | 23 +++---- .../token-classification/README.md | 5 +- .../token-classification/run_ner.py | 22 +++---- optimum/onnxruntime/model.py | 5 ++ optimum/onnxruntime/utils.py | 62 ++++++++++++++++++- 15 files changed, 165 insertions(+), 107 deletions(-) diff --git a/examples/onnxruntime/optimization/multiple-choice/run_swag.py b/examples/onnxruntime/optimization/multiple-choice/run_swag.py index 3c43846b9a5..bcddc975907 100644 --- a/examples/onnxruntime/optimization/multiple-choice/run_swag.py +++ b/examples/onnxruntime/optimization/multiple-choice/run_swag.py @@ -37,7 +37,7 @@ from optimum.onnxruntime import ORTModelForMultipleChoice, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. 
The version of transformers must be >= 4.19.0 @@ -236,7 +236,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path) @@ -254,13 +253,18 @@ def main(): optimizer = ORTOptimizer.from_pretrained(model) # Optimize the model - optimizer.optimize( + optimized_model_path = optimizer.optimize( optimization_config=optimization_config, save_dir=training_args.output_dir, use_external_data_format=onnx_export_args.use_external_data_format, one_external_file=onnx_export_args.one_external_file, ) + model = ORTModelForMultipleChoice.from_pretrained( + optimized_model_path, + provider=optim_args.execution_provider, + ) + if training_args.do_eval: # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the # prediction step(s) @@ -339,13 +343,12 @@ def compute_metrics(eval_predictions): # Evaluation logger.info("*** Evaluate ***") - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["label"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: diff --git a/examples/onnxruntime/optimization/question-answering/run_qa.py b/examples/onnxruntime/optimization/question-answering/run_qa.py index 04a9bd34f36..407714cb01f 100644 --- a/examples/onnxruntime/optimization/question-answering/run_qa.py +++ b/examples/onnxruntime/optimization/question-answering/run_qa.py @@ -37,7 +37,7 @@ from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
@@ -305,7 +305,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path) @@ -323,13 +322,15 @@ def main(): optimizer = ORTOptimizer.from_pretrained(model) # Optimize the model - optimizer.optimize( + optimized_model_path = optimizer.optimize( optimization_config=optimization_config, save_dir=training_args.output_dir, use_external_data_format=onnx_export_args.use_external_data_format, one_external_file=onnx_export_args.one_external_file, ) + model = ORTModelForQuestionAnswering.from_pretrained(optimized_model_path, provider=optim_args.execution_provider) + # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the # prediction step(s) if training_args.do_eval or training_args.do_predict: @@ -478,13 +479,12 @@ def compute_metrics(p: EvalPrediction): # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["start_positions", "end_positions"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) predictions = post_processing_function(eval_examples, eval_dataset, outputs.predictions) metrics = compute_metrics(predictions) @@ -514,12 +514,12 @@ def compute_metrics(p: EvalPrediction): # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["start_positions", "end_positions"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = post_processing_function(predict_examples, predict_dataset, outputs.predictions) metrics = compute_metrics(predictions) diff --git a/examples/onnxruntime/optimization/text-classification/README.md b/examples/onnxruntime/optimization/text-classification/README.md index 42a99cc73d3..3a7dce2b59f 100644 --- a/examples/onnxruntime/optimization/text-classification/README.md +++ b/examples/onnxruntime/optimization/text-classification/README.md @@ -14,13 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. --> -# Text classification +# Text classification ## GLUE tasks -The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py) -allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as -the ones from the [GLUE benchmark](https://gluebenchmark.com/). 
+The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py) allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as the ones from the [GLUE benchmark](https://gluebenchmark.com/). The following example applies graph optimization on a DistilBERT fine-tuned on the sst-2 task. Here the optimization level is selected to be 1, enabling basic optimizations such as redundant node eliminations and constant folding. Higher optimization level will result in hardware dependent optimized graph. diff --git a/examples/onnxruntime/optimization/text-classification/run_glue.py b/examples/onnxruntime/optimization/text-classification/run_glue.py index a07193915b8..222dda15074 100644 --- a/examples/onnxruntime/optimization/text-classification/run_glue.py +++ b/examples/onnxruntime/optimization/text-classification/run_glue.py @@ -42,7 +42,7 @@ from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -250,7 +250,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) @@ -268,13 +267,17 @@ def main(): optimizer = ORTOptimizer.from_pretrained(model) # Optimize the model - optimizer.optimize( + optimized_model_path = optimizer.optimize( optimization_config=optimization_config, save_dir=training_args.output_dir, use_external_data_format=onnx_export_args.use_external_data_format, one_external_file=onnx_export_args.one_external_file, ) + model = ORTModelForSequenceClassification.from_pretrained( + optimized_model_path, provider=optim_args.execution_provider + ) + # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the # prediction step(s) if training_args.do_eval or training_args.do_predict: @@ -408,13 +411,13 @@ def compute_metrics(p: EvalPrediction): desc="Running tokenizer on the evaluation dataset", ) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + eval_dataset=eval_dataset, compute_metrics=compute_metrics, label_names=["label"], ) - outputs = ort_model.evaluation_loop(eval_dataset) + # Save metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: json.dump(outputs.metrics, f, indent=4, sort_keys=True) @@ -436,10 +439,12 @@ def compute_metrics(p: EvalPrediction): desc="Running tokenizer on the test dataset", ) - ort_model = ORTModel( - optimized_model_path, execution_provider=optim_args.execution_provider, label_names=["label"] + outputs = evaluation_loop( + model=model, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, + label_names=["label"], ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.squeeze(outputs.predictions) if is_regression else np.argmax(outputs.predictions, axis=1) # Save predictions diff --git a/examples/onnxruntime/optimization/token-classification/run_ner.py b/examples/onnxruntime/optimization/token-classification/run_ner.py index 
73db3671d2f..2e7b63792c3 100644 --- a/examples/onnxruntime/optimization/token-classification/run_ner.py +++ b/examples/onnxruntime/optimization/token-classification/run_ner.py @@ -38,7 +38,7 @@ from optimum.onnxruntime import ORTModelForTokenClassification, ORTOptimizer from optimum.onnxruntime.configuration import OptimizationConfig -from optimum.onnxruntime.model import ORTModel +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -276,7 +276,6 @@ def main(): ) os.makedirs(training_args.output_dir, exist_ok=True) - optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx") tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path) @@ -480,12 +479,11 @@ def compute_metrics(p): desc="Running tokenizer on the validation dataset", ) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: @@ -509,12 +507,11 @@ def compute_metrics(p): desc="Running tokenizer on the prediction dataset", ) - ort_model = ORTModel( - optimized_model_path, - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.argmax(outputs.predictions, axis=2) # Remove ignored index (special tokens) diff --git a/examples/onnxruntime/quantization/image-classification/run_image_classification.py b/examples/onnxruntime/quantization/image-classification/run_image_classification.py index 3d0fa72882e..6feaaef4f3b 100644 --- a/examples/onnxruntime/quantization/image-classification/run_image_classification.py +++ b/examples/onnxruntime/quantization/image-classification/run_image_classification.py @@ -22,7 +22,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -38,7 +37,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForImageClassification from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -47,6 +45,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop logger = logging.getLogger(__name__) @@ -378,13 +377,16 @@ def compute_metrics(p: EvalPrediction): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForImageClassification.from_pretrained( + quantized_model_path, provider=optim_args.execution_provider + ) # Evaluation if training_args.do_eval: @@ -409,13 +411,12 @@ def compute_metrics(p: EvalPrediction): # Set the validation 
transforms eval_dataset = eval_dataset.with_transform(preprocess_function) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=[labels_column], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: json.dump(outputs.metrics, f, indent=4, sort_keys=True) diff --git a/examples/onnxruntime/quantization/multiple-choice/run_swag.py b/examples/onnxruntime/quantization/multiple-choice/run_swag.py index 9d9642c12d5..9a8423f836d 100644 --- a/examples/onnxruntime/quantization/multiple-choice/run_swag.py +++ b/examples/onnxruntime/quantization/multiple-choice/run_swag.py @@ -38,7 +38,6 @@ from optimum.onnxruntime import ORTModelForMultipleChoice, ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( ExcludeGeLUNodes, @@ -46,6 +45,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. The version of transformers must be >= 4.19.0 @@ -409,13 +409,14 @@ def compute_metrics(eval_predictions): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForMultipleChoice.from_pretrained(quantized_model_path, provider=optim_args.execution_provider) # Evaluation if training_args.do_eval: @@ -436,13 +437,12 @@ def compute_metrics(eval_predictions): load_from_cache_file=not data_args.overwrite_cache, ) - ort_model = ORTModel( - os.path.join(training_args.output_dir, "model_quantized.onnx"), - execution_provider=optim_args.execution_provider, - compute_metrics=compute_metrics, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, label_names=["label"], + compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: diff --git a/examples/onnxruntime/quantization/question-answering/README.md b/examples/onnxruntime/quantization/question-answering/README.md index 380afff8cad..8345ca8e4d0 100644 --- a/examples/onnxruntime/quantization/question-answering/README.md +++ b/examples/onnxruntime/quantization/question-answering/README.md @@ -16,13 +16,9 @@ limitations under the License. # Question answering +The script [`run_qa.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/question-answering/run_qa.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for question answering tasks. 
-The script [`run_qa.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/question-answering/run_qa.py) -allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph -optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for question answering tasks. - -Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along -the flag `--version_2_with_negative`. +Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along the flag `--version_2_with_negative`. The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the SQuAD1.0 dataset. diff --git a/examples/onnxruntime/quantization/question-answering/run_qa.py b/examples/onnxruntime/quantization/question-answering/run_qa.py index 4a6a854fd97..50661b7b420 100644 --- a/examples/onnxruntime/quantization/question-answering/run_qa.py +++ b/examples/onnxruntime/quantization/question-answering/run_qa.py @@ -24,7 +24,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -39,7 +38,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForQuestionAnswering from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -48,6 +46,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
@@ -651,25 +650,25 @@ def compute_metrics(p: EvalPrediction): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForQuestionAnswering.from_pretrained(quantized_model_path, provider=optim_args.execution_provider) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, label_names=["start_positions", "end_positions"], ) - outputs = ort_model.evaluation_loop(eval_dataset) predictions = post_processing_function(eval_examples, eval_dataset, outputs.predictions) metrics = compute_metrics(predictions) @@ -681,12 +680,11 @@ def compute_metrics(p: EvalPrediction): if training_args.do_predict: logger.info("*** Predict ***") - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, label_names=["start_positions", "end_positions"], ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = post_processing_function(predict_examples, predict_dataset, outputs.predictions) metrics = compute_metrics(predictions) diff --git a/examples/onnxruntime/quantization/text-classification/README.md b/examples/onnxruntime/quantization/text-classification/README.md index 460bb56fba8..95fd3335171 100644 --- a/examples/onnxruntime/quantization/text-classification/README.md +++ b/examples/onnxruntime/quantization/text-classification/README.md @@ -18,10 +18,7 @@ limitations under the License. ## GLUE tasks -The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/text-classification/run_glue.py) -allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph -optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as -the ones from the [GLUE benchmark](https://gluebenchmark.com/). +The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/text-classification/run_glue.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as the ones from the [GLUE benchmark](https://gluebenchmark.com/). The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the sst-2 task. 
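The script changes in this patch all follow the same migration: rather than wrapping the produced ONNX file in the now-deprecated `optimum.onnxruntime.model.ORTModel` and calling its `evaluation_loop` method, each example loads the exported model through the matching `ORTModelForXxx` class and passes it to the standalone `evaluation_loop` utility added to `optimum/onnxruntime/utils.py` further down. A condensed sketch of the new pattern, where the model path, dataset and metric function are placeholders standing in for the variables each script already defines:

from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.onnxruntime.utils import evaluation_loop

model = ORTModelForSequenceClassification.from_pretrained(
    quantized_model_path,  # placeholder: path returned by quantizer.quantize(...)
    provider="CPUExecutionProvider",
)

outputs = evaluation_loop(
    model=model,
    dataset=eval_dataset,  # placeholder: tokenized evaluation split (datasets.Dataset)
    label_names=["label"],
    compute_metrics=compute_metrics,  # placeholder: Callable[[EvalPrediction], dict]
)
print(outputs.metrics)
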
diff --git a/examples/onnxruntime/quantization/text-classification/run_glue.py b/examples/onnxruntime/quantization/text-classification/run_glue.py index bc141b2194f..4b9ee0403c3 100644 --- a/examples/onnxruntime/quantization/text-classification/run_glue.py +++ b/examples/onnxruntime/quantization/text-classification/run_glue.py @@ -23,7 +23,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -44,7 +43,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForSequenceClassification from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -53,6 +51,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -476,13 +475,16 @@ def compute_metrics(p: EvalPrediction): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForSequenceClassification.from_pretrained( + quantized_model_path, provider=optim_args.execution_provider + ) # Evaluation if training_args.do_eval: @@ -504,13 +506,13 @@ def compute_metrics(p: EvalPrediction): f" Evaluation results may suffer from a wrong matching." ) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, label_names=["label"], ) - outputs = ort_model.evaluation_loop(eval_dataset) + # Save metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: json.dump(outputs.metrics, f, indent=4, sort_keys=True) @@ -525,12 +527,11 @@ def compute_metrics(p: EvalPrediction): if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, label_names=["label"], ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.squeeze(outputs.predictions) if is_regression else np.argmax(outputs.predictions, axis=1) # Save predictions diff --git a/examples/onnxruntime/quantization/token-classification/README.md b/examples/onnxruntime/quantization/token-classification/README.md index f56388ed3c0..540b3cbe2dd 100644 --- a/examples/onnxruntime/quantization/token-classification/README.md +++ b/examples/onnxruntime/quantization/token-classification/README.md @@ -16,10 +16,7 @@ limitations under the License. 
# Token classification - -The script [`run_ner.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/token-classification/run_ner.py) -allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph -optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for token classification tasks. +The script [`run_ner.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/token-classification/run_ner.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for token classification tasks. The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the CoNLL-2003 task diff --git a/examples/onnxruntime/quantization/token-classification/run_ner.py b/examples/onnxruntime/quantization/token-classification/run_ner.py index 1cc12d3fbc0..3a5798c57a8 100644 --- a/examples/onnxruntime/quantization/token-classification/run_ner.py +++ b/examples/onnxruntime/quantization/token-classification/run_ner.py @@ -25,7 +25,6 @@ import sys from dataclasses import dataclass, field from functools import partial -from pathlib import Path from typing import Optional import datasets @@ -40,7 +39,6 @@ from optimum.onnxruntime import ORTQuantizer from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig -from optimum.onnxruntime.model import ORTModel from optimum.onnxruntime.modeling_ort import ORTModelForTokenClassification from optimum.onnxruntime.preprocessors import QuantizationPreprocessor from optimum.onnxruntime.preprocessors.passes import ( @@ -49,6 +47,7 @@ ExcludeNodeAfter, ExcludeNodeFollowedBy, ) +from optimum.onnxruntime.utils import evaluation_loop # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
@@ -551,13 +550,16 @@ def compute_metrics(p): quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax")) # Apply quantization on the model - quantizer.quantize( + quantized_model_path = quantizer.quantize( save_dir=training_args.output_dir, calibration_tensors_range=ranges, quantization_config=qconfig, preprocessor=quantization_preprocessor, use_external_data_format=onnx_export_args.use_external_data_format, ) + model = ORTModelForTokenClassification.from_pretrained( + quantized_model_path, provider=optim_args.execution_provider + ) # Evaluation if training_args.do_eval: @@ -572,12 +574,11 @@ def compute_metrics(p): desc="Running tokenizer on the validation dataset", ) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=eval_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(eval_dataset) # Save evaluation metrics with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f: @@ -602,12 +603,11 @@ def compute_metrics(p): desc="Running tokenizer on the prediction dataset", ) - ort_model = ORTModel( - Path(training_args.output_dir) / "model_quantized.onnx", - execution_provider=optim_args.execution_provider, + outputs = evaluation_loop( + model=model, + dataset=predict_dataset, compute_metrics=compute_metrics, ) - outputs = ort_model.evaluation_loop(predict_dataset) predictions = np.argmax(outputs.predictions, axis=2) # Remove ignored index (special tokens) diff --git a/optimum/onnxruntime/model.py b/optimum/onnxruntime/model.py index 23ca6e5e6a6..caa662f3824 100644 --- a/optimum/onnxruntime/model.py +++ b/optimum/onnxruntime/model.py @@ -49,6 +49,11 @@ def __init__( label_names (`List[str]`, `optional`): The list of keys in your dictionary of inputs that correspond to the labels. """ + + logger.warning( + "The class `optimum.onnxruntime.model.ORTModel` is deprecated and will be removed in the next release." 
+ ) + self.compute_metrics = compute_metrics self.label_names = ["labels"] if label_names is None else label_names self.session = InferenceSession(str(model_path), providers=[execution_provider]) diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 37d0feefcc4..ad40af92b9d 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -17,11 +17,15 @@ import re from enum import Enum from inspect import signature -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch from packaging import version +from tqdm import tqdm +from transformers import EvalPrediction +from transformers.trainer_pt_utils import nested_concat +from transformers.trainer_utils import EvalLoopOutput from transformers.utils import logging import onnxruntime as ort @@ -30,6 +34,12 @@ from ..utils.import_utils import _is_package_available +if TYPE_CHECKING: + from datasets import Dataset + + from .modeling_ort import ORTModel + + logger = logging.get_logger(__name__) ONNX_WEIGHTS_NAME = "model.onnx" @@ -341,3 +351,53 @@ class ORTQuantizableOperator(Enum): Resize = "Resize" AveragePool = "AveragePool" Concat = "Concat" + + +def evaluation_loop( + model: "ORTModel", + dataset: "Dataset", + label_names: Optional[List[str]] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, +): + """ + Run evaluation and returns metrics and predictions. + + Args: + model (`ORTModel`): + The ONNXRuntime model to use for the evaluation step. + dataset (`datasets.Dataset`): + Dataset to use for the evaluation step. + label_names (`List[str]`, `optional`): + The list of keys in your dictionary of inputs that correspond to the labels. + compute_metrics (`Callable[[EvalPrediction], Dict]`, `optional`): + The function that will be used to compute metrics at evaluation. Must take an `EvalPrediction` and + return a dictionary string to metric values. 
+ """ + + all_preds = None + all_labels = None + + for inputs in tqdm(dataset, desc="Evaluation"): + has_labels = all(inputs.get(k) is not None for k in label_names) + if has_labels: + labels = tuple(np.array([inputs.get(name)]) for name in label_names) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + inputs = {key: np.array([inputs[key]]) for key in model.input_names if key in inputs} + preds = model(**inputs) + + if len(preds) == 1: + preds = preds[0] + + all_preds = preds if all_preds is None else nested_concat(all_preds, preds, padding_index=-100) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + if compute_metrics is not None and all_preds is not None and all_labels is not None: + metrics = compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + else: + metrics = {} + + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=len(dataset)) From 171020c775cec6ff77826c3f5f5e5c1498b23f81 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 8 Jul 2024 14:05:47 +0200 Subject: [PATCH 12/12] Remove warning (#1945) Remove incorrect warning --- optimum/modeling_base.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 74b05d5b151..6f3f641b439 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -415,13 +415,6 @@ def from_pretrained( trust_remote_code=trust_remote_code, ) - if not export and trust_remote_code: - logger.warning( - "The argument `trust_remote_code` is to be used along with export=True. It will be ignored." - ) - elif export and trust_remote_code is None: - trust_remote_code = False - from_pretrained_method = cls._from_transformers if export else cls._from_pretrained return from_pretrained_method(