From 35d35bd3da52273cdd8fd2a8300b4598b2d96cc7 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 19 Dec 2024 14:14:56 +0400 Subject: [PATCH 1/6] Move check_dummy_inputs_allowed to common export utils (#2114) * move check_dummy_inputs_allowed to common export utils * move decoder_merge import * Update optimum/exporters/utils.py * Update optimum/exporters/utils.py * avoid onnx import if not necessary * move merge decoders import * fix style * add comment --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Co-authored-by: Ella Charlaix --- optimum/exporters/onnx/base.py | 12 ++++++---- optimum/exporters/onnx/config.py | 6 ++++- optimum/exporters/onnx/convert.py | 29 ++++--------------------- optimum/exporters/onnx/model_configs.py | 6 ++++- optimum/exporters/utils.py | 27 ++++++++++++++++++++++- 5 files changed, 48 insertions(+), 32 deletions(-) diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index 7e35691d54b..b5adb4522a2 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -27,16 +27,12 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np -import onnx from transformers.utils import is_accelerate_available, is_torch_available -from ...onnx import remove_duplicate_weights_from_tied_info - if is_torch_available(): import torch.nn as nn -from ...onnx import merge_decoders from ...utils import ( DEFAULT_DUMMY_SHAPES, DummyInputGenerator, @@ -54,6 +50,8 @@ from .model_patcher import ModelPatcher, Seq2SeqModelPatcher +# TODO : moved back onnx imports applied in https://github.com/huggingface/optimum/pull/2114/files after refactorization + if is_accelerate_available(): from accelerate.utils import find_tied_parameters @@ -542,6 +540,10 @@ def post_process_exported_models( first_key = next(iter(models_and_onnx_configs)) if is_torch_available() and isinstance(models_and_onnx_configs[first_key][0], nn.Module): if is_accelerate_available(): + import onnx + + from ...onnx import remove_duplicate_weights_from_tied_info + logger.info("Deduplicating shared (tied) weights...") for subpath, key in zip(onnx_files_subpaths, models_and_onnx_configs): torch_model = models_and_onnx_configs[key][0] @@ -934,6 +936,8 @@ def post_process_exported_models( decoder_with_past_path = Path(path, onnx_files_subpaths[2]) decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") try: + from ...onnx import merge_decoders + # The decoder with past does not output the cross attention past key values as they are constant, # hence the need for strict=False merge_decoders( diff --git a/optimum/exporters/onnx/config.py b/optimum/exporters/onnx/config.py index 9e808e392b9..69366d6be13 100644 --- a/optimum/exporters/onnx/config.py +++ b/optimum/exporters/onnx/config.py @@ -20,7 +20,6 @@ from transformers.utils import is_tf_available -from ...onnx import merge_decoders from ...utils import ( DummyAudioInputGenerator, DummyBboxInputGenerator, @@ -38,6 +37,9 @@ from .model_patcher import DecoderModelPatcher +# TODO : moved back onnx imports applied in https://github.com/huggingface/optimum/pull/2114/files after refactorization + + if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel @@ -129,6 +131,8 @@ def post_process_exported_models( # Attempt to merge only if the decoder-only was exported separately without/with past if self.use_past is True and len(models_and_onnx_configs) == 2: + from ...onnx import merge_decoders + decoder_path = 
Path(path, onnx_files_subpaths[0]) decoder_with_past_path = Path(path, onnx_files_subpaths[1]) decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index c12a9ac222a..80d945580c7 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -22,7 +22,7 @@ from inspect import signature from itertools import chain from pathlib import Path -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import onnx @@ -45,6 +45,7 @@ from ...utils.save_utils import maybe_save_preprocessors from ..error_utils import AtolError, MinimumVersionError, OutputMatchError, ShapeError from ..tasks import TasksManager +from ..utils import check_dummy_inputs_are_allowed from .base import OnnxConfig from .constants import UNPICKABLE_ARCHS from .model_configs import SpeechT5OnnxConfig @@ -56,6 +57,8 @@ ) +# TODO : moved back onnx imports applied in https://github.com/huggingface/optimum/pull/2114/files after refactorization + if is_torch_available(): import torch import torch.nn as nn @@ -75,30 +78,6 @@ class DynamicAxisNameError(ValueError): pass -def check_dummy_inputs_are_allowed( - model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], dummy_input_names: Iterable[str] -): - """ - Checks that the dummy inputs from the ONNX config is a subset of the allowed inputs for `model`. - Args: - model (`Union[transformers.PreTrainedModel, transformers.TFPreTrainedModel`]): - The model instance. - model_inputs (`Iterable[str]`): - The model input names. - """ - - forward = model.forward if is_torch_available() and isinstance(model, nn.Module) else model.call - forward_parameters = signature(forward).parameters - forward_inputs_set = set(forward_parameters.keys()) - dummy_input_names = set(dummy_input_names) - - # We are fine if config_inputs has more keys than model_inputs - if not dummy_input_names.issubset(forward_inputs_set): - raise ValueError( - f"Config dummy inputs are not a subset of the model inputs: {dummy_input_names} vs {forward_inputs_set}" - ) - - def validate_models_outputs( models_and_onnx_configs: Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 4c5a727a183..3a48a579c2c 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -21,7 +21,6 @@ from packaging import version from transformers.utils import is_tf_available -from ...onnx import merge_decoders from ...utils import ( DEFAULT_DUMMY_SHAPES, BloomDummyPastKeyValuesGenerator, @@ -94,6 +93,9 @@ ) +# TODO : moved back onnx imports applied in https://github.com/huggingface/optimum/pull/2114/files after refactorization + + if TYPE_CHECKING: from transformers import PretrainedConfig from transformers.modeling_utils import PreTrainedModel @@ -2018,6 +2020,8 @@ def post_process_exported_models( decoder_with_past_path = Path(path, onnx_files_subpaths[3]) decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") try: + from ...onnx import merge_decoders + # The decoder with past does not output the cross attention past key values as they are constant, # hence the need for strict=False merge_decoders( diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index 60de169de5e..d4a4111075d 100644 --- 
a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -16,7 +16,8 @@ """Utilities for model preparation to export.""" import copy -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +from inspect import signature +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import torch from packaging import version @@ -675,3 +676,27 @@ def _get_submodels_and_export_configs( export_config = next(iter(models_and_export_configs.values()))[1] return export_config, models_and_export_configs + + +def check_dummy_inputs_are_allowed( + model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], dummy_input_names: Iterable[str] +): + """ + Checks that the dummy inputs from the ONNX config is a subset of the allowed inputs for `model`. + Args: + model (`Union[transformers.PreTrainedModel, transformers.TFPreTrainedModel`]): + The model instance. + model_inputs (`Iterable[str]`): + The model input names. + """ + + forward = model.forward if is_torch_available() and isinstance(model, torch.nn.Module) else model.call + forward_parameters = signature(forward).parameters + forward_inputs_set = set(forward_parameters.keys()) + dummy_input_names = set(dummy_input_names) + + # We are fine if config_inputs has more keys than model_inputs + if not dummy_input_names.issubset(forward_inputs_set): + raise ValueError( + f"Config dummy inputs are not a subset of the model inputs: {dummy_input_names} vs {forward_inputs_set}" + ) From 0ea269fb714877b5006e3293026d397de8d53767 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 19 Dec 2024 11:15:28 +0100 Subject: [PATCH 2/6] Remove CI macos runners (#2129) remove macos runners --- .github/workflows/test_bettertransformer.yml | 2 +- .github/workflows/test_onnxruntime.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_bettertransformer.yml b/.github/workflows/test_bettertransformer.yml index b023fa4bd1b..016e97304ad 100644 --- a/.github/workflows/test_bettertransformer.yml +++ b/.github/workflows/test_bettertransformer.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.9] - os: [ubuntu-20.04, macos-14] + os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index b20a3b46f88..a0c5893d62c 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: transformers-version: ["latest"] - os: [ubuntu-20.04, windows-2019, macos-15] + os: [ubuntu-20.04, windows-2019] # TODO : add macos-15 after mps fix include: - transformers-version: "4.36.*" os: ubuntu-20.04 From 21de42f05c297e6d165def90a5db95d5637b6d6c Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 19 Dec 2024 18:30:09 +0800 Subject: [PATCH 3/6] Enable GPTQModel (#2064) * align gptq check to transformers for supporting cpu * fix comment * gptqmodel Signed-off-by: jiqing-feng * compatible with auto-gptq Signed-off-by: jiqing-feng * fix compatible with auto-gptq Signed-off-by: jiqing-feng * fix compatible with auto-gptq linear Signed-off-by: jiqing-feng * revert unrelated changes Signed-off-by: jiqing-feng * gptqmodel need use checkpoint_format (#1) * need checkpoint_format * default value of checkpoint_format is gptq * fix quantize * fix quantize * fix quantize * Update quantizer.py * need convert to v1 before gptqmodel save * back checkpoint_format 
to gptq after convert * cleanup code * sym=False is not supported with auto-gptq * add comments * cleanup code * Update quantizer.py * always convert v2 to v1 if checkpoint_format = "gptq" * Update quantizer.py --------- Co-authored-by: ZX-ModelCloud Co-authored-by: Qubitium-ModelCloud * Mod backend code (#2) * keep gptq_v2 if sym is false * use hf_convert_gptq_v1_to_v2_format, hf_convert_gptq_v2_to_v1_format, and hf_gptqmodel_post_init * no need check backend * use device_map * cleanup * Update quantizer.py * move import --------- Co-authored-by: Qubitium-ModelCloud * fix format and log Signed-off-by: jiqing-feng * fix version check Signed-off-by: jiqing-feng * enable gptqmodel tests Signed-off-by: jiqing-feng * update check quant type Signed-off-by: jiqing-feng * Fix optimum compat (#3) * add meta info * cleanup * cleanup * The value of quantizer should be an array * Update quantizer.py * If is_auto_gptq_available() also writes "auto_gptq:version" to "quantizer" * If is_auto_gptq_available() also writes "auto_gptq:version" to "quantizer" * Update quantizer.py * cleanup * comment on meta * hf_select_quant_linear pass checkpoint_format * add todo fix * move convert code to quantizer.save() * Update quantizer.py * Optimize hf_convert_gptq_v2_to_v1_format() * Optimize hf_convert_gptq_v1_to_v2_format() * fix GPTQTestCUDA * hf_select_quant_linear() always set pack=True * gptqmodel.hf_select_quant_linear() now does not select ExllamaV2 * gptqmodel.hf_select_quant_linear() now does not select ExllamaV2 * GPTQQuantizer add backend * lower checkpoint_format and backend * cleanup * move backend to bottom * no need to check gptqmodel version for ipex support * Update import_utils.py * Update quantizer.py * fix UnboundLocalError: cannot access local variable 'version' where it is not associated with a value * make version var short * Update import_utils.py * fix unittest * use assertLessEqual --------- Co-authored-by: Qubitium-ModelCloud Co-authored-by: LRL * fix format and convert v2 to v1 Signed-off-by: jiqing-feng * [Fix] all tensors not same device (#5) * fix device error * update gptqmodel version * fix test * fix format Signed-off-by: jiqing-feng * add gptqmodel tests which contains cpu Signed-off-by: jiqing-feng * fix all auto-gptq tests Signed-off-by: jiqing-feng * revert tests Signed-off-by: jiqing-feng * rm gptqmodel yaml Signed-off-by: jiqing-feng * fix comment Signed-off-by: jiqing-feng * enable real cpu tests by fp32 Signed-off-by: jiqing-feng * fix test model name Signed-off-by: jiqing-feng * keep the original device setting when using auto-gptq Signed-off-by: jiqing-feng * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --------- Signed-off-by: jiqing-feng Co-authored-by: LRL-ModelCloud <165116337+LRL-ModelCloud@users.noreply.github.com> Co-authored-by: ZX-ModelCloud Co-authored-by: Qubitium-ModelCloud Co-authored-by: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com> Co-authored-by: LRL Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/gptq/quantizer.py | 253 ++++++++++++++++++++++++++-------- optimum/gptq/utils.py | 15 ++ optimum/utils/__init__.py | 1 + optimum/utils/import_utils.py | 19 ++- 4 files changed, 227 insertions(+), 61 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 
849d8821ebf..844da3e3157 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import importlib import json import os from enum import Enum @@ -19,17 +20,26 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch +from packaging import version from torch import nn from tqdm.auto import tqdm from transformers import AutoTokenizer from transformers.pytorch_utils import Conv1D from transformers.utils.quantization_config import QuantizationMethod -from ..utils import is_accelerate_available, is_auto_gptq_available +from ..utils import is_accelerate_available, is_auto_gptq_available, is_gptqmodel_available from ..utils.modeling_utils import recurse_getattr +from ..version import __version__ as optimum_version from .constants import GPTQ_CONFIG from .data import get_dataset, prepare_dataset -from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen +from .utils import ( + get_block_name_with_pattern, + get_device, + get_layers, + get_preceding_modules, + get_seqlen, + nested_move_to, +) if is_accelerate_available(): @@ -40,14 +50,27 @@ from accelerate.hooks import remove_hook_from_module if is_auto_gptq_available(): + from auto_gptq import __version__ as autogptq_version from auto_gptq import exllama_set_max_input_length - from auto_gptq.modeling._utils import autogptq_post_init + from auto_gptq.modeling._utils import autogptq_post_init as gptq_post_init from auto_gptq.quantization import GPTQ - from auto_gptq.utils.import_utils import dynamically_import_QuantLinear + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear + +if is_gptqmodel_available(): + from gptqmodel import exllama_set_max_input_length + from gptqmodel.quantization import GPTQ + from gptqmodel.utils.importer import hf_select_quant_linear + from gptqmodel.utils.model import hf_convert_gptq_v1_to_v2_format, hf_convert_gptq_v2_to_v1_format + from gptqmodel.utils.model import hf_gptqmodel_post_init as gptq_post_init + from gptqmodel.version import __version__ as gptqmodel_version logger = getLogger(__name__) +def has_device_more_than_cpu(): + return torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available()) + + class ExllamaVersion(int, Enum): ONE = 1 TWO = 2 @@ -74,10 +97,13 @@ def __init__( batch_size: int = 1, pad_token_id: Optional[int] = None, disable_exllama: bool = False, - exllama_config: Dict[str, Any] = None, + exllama_config: Optional[Dict[str, Any]] = None, max_input_length: Optional[int] = None, cache_block_outputs: Optional[bool] = True, modules_in_block_to_quantize: Optional[List[List[str]]] = None, + checkpoint_format: str = "gptq", + meta: Optional[Dict[str, any]] = None, + backend: Optional[str] = None, *args, **kwargs, ): @@ -129,6 +155,13 @@ def __init__( List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized. The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially. If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]` + checkpoint_format (`str`, *optional*, defaults to `gptq`): + GPTQ weight format. 
`gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only. + meta (`Dict[str, any]`, *optional*): + Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta. + i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"] + backend (`str`, *optional*): + Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py """ self.bits = bits @@ -138,6 +171,9 @@ def __init__( self.desc_act = desc_act self.sym = sym self.true_sequential = true_sequential + self.checkpoint_format = checkpoint_format.lower() + self.meta = meta + self.backend = backend.lower() if backend is not None else None self.use_cuda_fp16 = use_cuda_fp16 self.model_seqlen = model_seqlen self.block_name_to_quantize = block_name_to_quantize @@ -161,6 +197,8 @@ def __init__( "true_sequential", "quant_method", "modules_in_block_to_quantize", + "checkpoint_format", + "meta", ] if self.bits not in [2, 3, 4, 8]: @@ -182,6 +220,28 @@ def __init__( ) self.exllama_version = self.exllama_config["version"] + def select_quant_linear(self, device_map: Union[str, dict]): + if is_gptqmodel_available(): + self.quant_linear = hf_select_quant_linear( + bits=self.bits, + group_size=self.group_size, + desc_act=self.desc_act, + sym=self.sym, + checkpoint_format=self.checkpoint_format, + meta=self.meta, + device_map=device_map, + backend=self.backend, + ) + else: + self.quant_linear = hf_select_quant_linear( + use_triton=False, + desc_act=self.desc_act, + group_size=self.group_size, + bits=self.bits, + disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, + disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, + ) + def to_dict(self): """ Returns the args in dict format. @@ -189,6 +249,20 @@ def to_dict(self): gptq_dict = {} for key in self.serialization_keys: gptq_dict[key] = getattr(self, key) + + if gptq_dict.get("meta") is None: + gptq_dict["meta"] = {} + + meta = gptq_dict["meta"] + # store both optimum:version and gptq_lib:version into quantize_config.meta.quantizer + if meta.get("quantizer") is None: + meta["quantizer"] = [f"optimum:{optimum_version}"] + + if is_gptqmodel_available(): + meta["quantizer"].append(f"gptqmodel:{gptqmodel_version}") + elif is_auto_gptq_available(): + meta["quantizer"].append(f"auto_gptq:{autogptq_version}") + return gptq_dict @classmethod @@ -205,7 +279,7 @@ def from_dict(cls, config_dict: Dict[str, Any]): """ return cls(**config_dict) - def convert_model(self, model: nn.Module): + def convert_model(self, model: nn.Module, **kwargs): """ Convert the model to a GPTQ model by getting and replacing the layers. 
@@ -226,7 +300,11 @@ def convert_model(self, model: nn.Module): f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)" ) del layers_to_be_replaced[name] + + self.select_quant_linear(device_map=kwargs.get("device_map", None)) + self._replace_by_quant_layers(model, layers_to_be_replaced) + return model def get_no_split_module_classes(self, model): @@ -253,15 +331,7 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st name (`str`, defaults to `""`): To keep track of the name of the current module """ - QuantLinear = dynamically_import_QuantLinear( - use_triton=False, - desc_act=self.desc_act, - group_size=self.group_size, - bits=self.bits, - disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, - disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, - ) - if isinstance(module, QuantLinear): + if isinstance(module, self.quant_linear): return for attr in dir(module): layer = getattr(module, attr) @@ -279,20 +349,37 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st in_features = layer.weight.shape[0] out_features = layer.weight.shape[1] bias = layer.bias is not None - if not (self.desc_act) or self.group_size == -1: - new_layer = QuantLinear( + if is_gptqmodel_available(): + new_layer = self.quant_linear( self.bits, self.group_size, + self.desc_act, + self.sym, in_features, out_features, bias, - use_cuda_fp16=self.use_cuda_fp16, weight_dtype=layer.weight.dtype, ) else: - new_layer = QuantLinear( - self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype - ) + if not (self.desc_act) or self.group_size == -1: + new_layer = self.quant_linear( + self.bits, + self.group_size, + in_features, + out_features, + bias, + use_cuda_fp16=self.use_cuda_fp16, + weight_dtype=layer.weight.dtype, + ) + else: + new_layer = self.quant_linear( + self.bits, + self.group_size, + in_features, + out_features, + bias, + weight_dtype=layer.weight.dtype, + ) new_layer.device = device setattr(module, attr, new_layer.to(device)) for name1, child in module.named_children(): @@ -318,13 +405,41 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None): `nn.Module`: The quantized model """ - if not is_auto_gptq_available(): - raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`") - if not torch.cuda.is_available(): - raise RuntimeError("No GPU found. A GPU is needed to quantize model.") + if not is_auto_gptq_available() and not is_gptqmodel_available(): + raise RuntimeError( + "gptqmodel or auto-gptq is required in order to perform gptq quantzation: `pip install gptqmodel` or `pip install auto-gptq`. Please notice that auto-gptq will be deprecated in the future." + ) + elif is_gptqmodel_available() and is_auto_gptq_available(): + logger.warning( + "Detected gptqmodel and auto-gptq, will use gptqmodel. The auto_gptq will be deprecated in the future." + ) + + gptq_supports_cpu = ( + is_auto_gptq_available() + and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2") + ) or is_gptqmodel_available() + + if not gptq_supports_cpu and not torch.cuda.is_available(): + raise RuntimeError( + "No cuda gpu or cpu support using Intel/IPEX found. A gpu or cpu with Intel/IPEX is required for quantization." 
+ ) + + if not self.sym and not is_gptqmodel_available(): + raise ValueError( + "Asymmetric sym=False quantization is not supported with auto-gptq. Please use gptqmodel: `pip install gptqmodel`" + ) + + if self.checkpoint_format == "gptq_v2" and not is_gptqmodel_available(): + raise ValueError( + "gptq_v2 format only supported with gptqmodel. Please install gptqmodel: `pip install gptqmodel`" + ) model.eval() + # gptqmodel internal is gptq_v2 for asym support, gptq(v1) can only support sym=True + if is_gptqmodel_available() and self.checkpoint_format != "gptq_v2": + self.checkpoint_format = "gptq_v2" + # For Transformer model has_config = False has_device_map = False @@ -403,27 +518,32 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None): blocks = recurse_getattr(model, self.block_name_to_quantize) + cur_layer_device = get_device(blocks[0]) + if not is_gptqmodel_available(): + cur_layer_device = 0 + if not has_device_map: - # put modules from module_name_preceding_first_block on cuda + # put modules from module_name_preceding_first_block on cuda or xpu or cpu + to_device = cur_layer_device for module_name in self.module_name_preceding_first_block: module = recurse_getattr(model, module_name) if module is None: raise ValueError(f"Module {module_name} was not found in model") - module = module.to(0) - blocks[0] = blocks[0].to(0) + module = module.to(to_device) + blocks[0] = blocks[0].to(to_device) def store_input_hook(_, input, *args): kwargs = args[0] if input is None: if "hidden_states" in kwargs: - input = (kwargs["hidden_states"],) + input = (nested_move_to(kwargs["hidden_states"], cur_layer_device),) else: raise ValueError("No input value found in the foward pass") layer_inputs.append(input) other_kwargs = {} for k, v in kwargs.items(): # make sure other arguments also be captured if k not in ["hidden_states"]: - other_kwargs[k] = v + other_kwargs[k] = nested_move_to(v, cur_layer_device) layer_input_kwargs.append(other_kwargs) raise ValueError @@ -431,11 +551,7 @@ def store_input_hook(_, input, *args): handle = blocks[0].register_forward_pre_hook(store_input_hook, with_kwargs=True) for data in dataset: for k, v in data.items(): - # put the data on gpu, we won't put them back to cpu - if not has_device_map or device.type == "cpu": - data[k] = v.to(0) - else: - data[k] = v.to(device) + data[k] = nested_move_to(v, cur_layer_device) try: model(**data) except ValueError: @@ -450,6 +566,8 @@ def store_input_hook(_, input, *args): raise ValueError(f"Module {module_name} was not found in model") torch.cuda.empty_cache() + if hasattr(torch, "xpu") and torch.xpu.is_available(): + torch.xpu.empty_cache() # Step 3: Quantize the blocks quantizers = {} @@ -460,11 +578,7 @@ def store_input_hook(_, input, *args): handle = block.register_forward_pre_hook(store_input_hook, with_kwargs=True) for data in dataset: for k, v in data.items(): - # put the data on gpu, we won't put them back to cpu - if not has_device_map or device.type == "cpu": - data[k] = v.to(0) - else: - data[k] = v.to(device) + data[k] = nested_move_to(v, cur_layer_device) try: model(**data) except ValueError: @@ -473,9 +587,12 @@ def store_input_hook(_, input, *args): # move block to cuda if needed # in case we have offload modules, we need to put them on cuda because of GPTQ object - if not has_device_map or get_device(block) == torch.device("cpu"): + if (not has_device_map or get_device(block) == torch.device("cpu")) and has_device_more_than_cpu(): block = block.to(0) layers = get_layers(block) + block_device = 
get_device(block) + if not is_gptqmodel_available(): + block_device = 0 if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0: if self.true_sequential: layers_name_list = self.modules_in_block_to_quantize @@ -509,15 +626,20 @@ def tmp(_, input, output): for j in range(len(dataset)): # the args are already on the gpu # don't need to store the output + layer_inputs[j] = nested_move_to(layer_inputs[j], block_device) + for k, v in layer_input_kwargs[j].items(): + layer_input_kwargs[j][k] = nested_move_to(v, block_device) + block(*layer_inputs[j], **layer_input_kwargs[j]) # remove hook for h in handles: h.remove() for name in subset_name_list: logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...") - scale, zero, g_idx = gptq[name].fasterquant( + quant_outputs = gptq[name].fasterquant( percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act ) + scale, zero, g_idx = quant_outputs[0], quant_outputs[1], quant_outputs[2] quantizers[f"{self.block_name_to_quantize}.{i}.{name}"] = ( gptq[name].quantizer, scale, @@ -543,11 +665,13 @@ def tmp(_, input, output): del layer_inputs layer_inputs = [] torch.cuda.empty_cache() + if hasattr(torch, "xpu") and torch.xpu.is_available(): + torch.xpu.empty_cache() if self.bits == 4: # device not on gpu if device.type != "cuda" or (has_device_map and any(d in devices for d in ["cpu", "disk", "hpu"])): - if not self.disable_exllama: + if not self.disable_exllama and not is_gptqmodel_available(): logger.warning( "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`" ) @@ -578,6 +702,8 @@ def tmp(_, input, output): model = self.post_init_model(model) torch.cuda.empty_cache() + if hasattr(torch, "xpu") and torch.xpu.is_available(): + torch.xpu.empty_cache() return model def post_init_model(self, model): @@ -601,9 +727,14 @@ def post_init_model(self, model): class StoreAttr(object): pass + if is_gptqmodel_available(): + model, _ = hf_convert_gptq_v1_to_v2_format( + model, self.bits, self.quant_linear, self.checkpoint_format, self.meta + ) + model.quantize_config = StoreAttr() model.quantize_config.desc_act = self.desc_act - model = autogptq_post_init(model, use_act_order=self.desc_act) + model = gptq_post_init(model, use_act_order=self.desc_act) if ( self.desc_act and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE) @@ -626,19 +757,14 @@ def pack_model( quantizers (`Dict[str,Tuple]`): A mapping of the layer name and the data needed to pack the layer """ - QuantLinear = dynamically_import_QuantLinear( - use_triton=False, - desc_act=self.desc_act, - group_size=self.group_size, - bits=self.bits, - disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, - disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, - ) logger.info("Packing model...") layers = get_layers(model) layers = {n: layers[n] for n in quantizers} + + self.select_quant_linear(device_map=model.hf_device_map) + self._replace_by_quant_layers(model, quantizers) - qlayers = get_layers(model, [QuantLinear]) + qlayers = get_layers(model, [self.quant_linear]) for name in qlayers: logger.info(name) quantizers[name], scale, zero, g_idx = quantizers[name] @@ -673,6 +799,15 @@ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", sa Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). 
""" + + # convert gptqmodel internal gptq_v2 format to v1 for max compatibility + if is_gptqmodel_available(): + model, converted = hf_convert_gptq_v2_to_v1_format( + model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta + ) + if converted: + self.checkpoint_format = "gptq" + os.makedirs(save_dir, exist_ok=True) model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization) with open(os.path.join(save_dir, GPTQ_CONFIG), "w", encoding="utf-8") as f: @@ -736,10 +871,12 @@ def load_quantized_model( Returns: `nn.Module`: The quantized model """ - if not torch.cuda.is_available(): - raise RuntimeError("No GPU found. A GPU is needed to run quantized model.") - if not is_auto_gptq_available(): - raise RuntimeError("auto-gptq is required in order to load quantized weights : `pip install auto-gptq`") + if not torch.cuda.is_available() and not is_gptqmodel_available(): + raise RuntimeError("No GPU found. A GPU is needed to run quantized model by auto_gptq.") + if not is_auto_gptq_available() and not is_gptqmodel_available(): + raise RuntimeError( + "gptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`) is required in order to load quantized weights. Please notice that auto-gptq will be deprecated in the future." + ) if not is_accelerate_available(): raise RuntimeError( "You need to install accelerate in order to load and dispatch weights to" @@ -777,7 +914,7 @@ def load_quantized_model( quantizer.exllama_version = quantizer.exllama_config["version"] quantizer.max_input_length = max_input_length - model = quantizer.convert_model(model) + model = quantizer.convert_model(model, device_map=device_map) if no_split_module_classes is None: no_split_module_classes = quantizer.get_no_split_module_classes(model) diff --git a/optimum/gptq/utils.py b/optimum/gptq/utils.py index a5f9afdaaef..732ecbd66b9 100644 --- a/optimum/gptq/utils.py +++ b/optimum/gptq/utils.py @@ -113,3 +113,18 @@ def get_seqlen(model: nn.Module): "We couldn't get the model sequence length. Setting it to 2048. 
You can overwrite this value by passing `model_seqlen` in` GPTQQuantizer`" ) return 2048 + + +def move_to(obj: torch.Tensor, device: torch.device): + if get_device(obj) != device: + obj = obj.to(device) + return obj + + +def nested_move_to(v, device): + if isinstance(v, torch.Tensor): + return move_to(v, device) + elif isinstance(v, (list, tuple)): + return type(v)([nested_move_to(e, device) for e in v]) + else: + return v diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 2aa90253d08..e2b53a7dbc7 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -37,6 +37,7 @@ is_auto_gptq_available, is_datasets_available, is_diffusers_available, + is_gptqmodel_available, is_onnx_available, is_onnxruntime_available, is_pydantic_available, diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 405e3815b33..d0f4c85db2b 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -52,6 +52,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ TRANSFORMERS_MINIMUM_VERSION = version.parse("4.25.0") DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0") AUTOGPTQ_MINIMUM_VERSION = version.parse("0.4.99") # Allows 0.5.0.dev0 +GPTQMODEL_MINIMUM_VERSION = version.parse("1.4.2") # This is the minimal required version to support some ONNX Runtime features @@ -67,6 +68,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _accelerate_available = _is_package_available("accelerate") _diffusers_available = _is_package_available("diffusers") _auto_gptq_available = _is_package_available("auto_gptq") +_gptqmodel_available = _is_package_available("gptqmodel") _timm_available = _is_package_available("timm") _sentence_transformers_available = _is_package_available("sentence_transformers") _datasets_available = _is_package_available("datasets") @@ -138,12 +140,23 @@ def is_datasets_available(): def is_auto_gptq_available(): if _auto_gptq_available: - version_autogptq = version.parse(importlib_metadata.version("auto_gptq")) - if AUTOGPTQ_MINIMUM_VERSION < version_autogptq: + v = version.parse(importlib_metadata.version("auto_gptq")) + if v >= AUTOGPTQ_MINIMUM_VERSION: return True else: raise ImportError( - f"Found an incompatible version of auto-gptq. Found version {version_autogptq}, but only version above {AUTOGPTQ_MINIMUM_VERSION} are supported" + f"Found an incompatible version of auto-gptq. Found version {v}, but only version >= {AUTOGPTQ_MINIMUM_VERSION} are supported" + ) + + +def is_gptqmodel_available(): + if _gptqmodel_available: + v = version.parse(importlib_metadata.version("gptqmodel")) + if v >= GPTQMODEL_MINIMUM_VERSION: + return True + else: + raise ImportError( + f"Found an incompatible version of gptqmodel. 
Found version {v}, but only version >= {GPTQMODEL_MINIMUM_VERSION} are supported" ) From 34b3d8bdfebe94ca34d61d5aeadcbc49eee6f95d Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:21:13 +0100 Subject: [PATCH 4/6] Skip private model loading for external contributors (#2130) --- tests/onnxruntime/test_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 255c0d9d0e7..456ad73505e 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -974,7 +974,7 @@ def test_stable_diffusion_model_on_rocm_ep_str(self): def test_load_model_from_hub_private(self): token = os.environ.get("HF_HUB_READ_TOKEN", None) - if token is None: + if not token: self.skipTest( "Test requires a read access token for optimum-internal-testing in the environment variable `HF_HUB_READ_TOKEN`." ) From 984012142a62b34300966da2a7c98e9e851bc6ee Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 20 Dec 2024 15:20:27 +0400 Subject: [PATCH 5/6] fix sdxl refiner export (#2133) --- optimum/exporters/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index d4a4111075d..02b1d0fe3af 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -139,7 +139,11 @@ def _get_submodels_for_export_diffusion( # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) unet.config.time_cond_proj_dim = getattr(pipeline.unet.config, "time_cond_proj_dim", None) - unet.config.text_encoder_projection_dim = pipeline.text_encoder.config.projection_dim + unet.config.text_encoder_projection_dim = ( + pipeline.text_encoder.config.projection_dim + if not is_sdxl + else pipeline.text_encoder_2.config.projection_dim + ) unet.config.export_model_type = _get_diffusers_submodel_type(unet) models_for_export["unet"] = unet From d21256c2964945fc3fe4623f7befb21082b69a25 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Fri, 20 Dec 2024 05:05:38 -0800 Subject: [PATCH 6/6] Export to ExecuTorch: Initial Integration (#2090) Co-authored-by: Guang Yang Co-authored-by: Michael Benayoun Co-authored-by: Github Executorch --- .github/workflows/test_executorch_export.yml | 35 ++ .github/workflows/test_executorch_runtime.yml | 42 ++ docs/Dockerfile | 4 +- docs/source/_toctree.yml | 17 + docs/source/exporters/executorch/overview.mdx | 26 + .../package_reference/configuration.mdx | 54 ++ .../executorch/package_reference/export.mdx | 26 + .../executorch/usage_guides/contribute.mdx | 57 +++ .../usage_guides/export_a_model.mdx | 124 +++++ docs/source/exporters/overview.mdx | 2 +- optimum/commands/__init__.py | 2 +- optimum/commands/export/__init__.py | 1 + optimum/commands/export/base.py | 6 + optimum/commands/export/executorch.py | 67 +++ optimum/executorchruntime/__init__.py | 29 ++ .../executorchruntime/modeling_executorch.py | 460 ++++++++++++++++++ optimum/exporters/__init__.py | 1 + optimum/exporters/executorch/__init__.py | 50 ++ optimum/exporters/executorch/__main__.py | 160 ++++++ optimum/exporters/executorch/convert.py | 90 ++++ .../exporters/executorch/recipe_registry.py | 68 +++ .../exporters/executorch/recipes/__init__.py | 13 + 
.../exporters/executorch/recipes/xnnpack.py | 97 ++++ optimum/exporters/executorch/task_registry.py | 68 +++ .../exporters/executorch/tasks/__init__.py | 13 + .../exporters/executorch/tasks/causal_lm.py | 66 +++ setup.py | 4 + tests/executorch/export/__init__.py | 14 + .../export/test_exporters_executorch.py | 115 +++++ tests/executorch/runtime/__init__.py | 14 + tests/executorch/runtime/test_modeling.py | 70 +++ .../executorch/runtime/test_modeling_gemma.py | 54 ++ .../runtime/test_modeling_gemma2.py | 56 +++ .../executorch/runtime/test_modeling_llama.py | 83 ++++ .../executorch/runtime/test_modeling_olmo.py | 54 ++ .../executorch/runtime/test_modeling_qwen2.py | 52 ++ 36 files changed, 2090 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/test_executorch_export.yml create mode 100644 .github/workflows/test_executorch_runtime.yml create mode 100644 docs/source/exporters/executorch/overview.mdx create mode 100644 docs/source/exporters/executorch/package_reference/configuration.mdx create mode 100644 docs/source/exporters/executorch/package_reference/export.mdx create mode 100644 docs/source/exporters/executorch/usage_guides/contribute.mdx create mode 100644 docs/source/exporters/executorch/usage_guides/export_a_model.mdx create mode 100644 optimum/commands/export/executorch.py create mode 100644 optimum/executorchruntime/__init__.py create mode 100644 optimum/executorchruntime/modeling_executorch.py create mode 100644 optimum/exporters/executorch/__init__.py create mode 100644 optimum/exporters/executorch/__main__.py create mode 100644 optimum/exporters/executorch/convert.py create mode 100644 optimum/exporters/executorch/recipe_registry.py create mode 100644 optimum/exporters/executorch/recipes/__init__.py create mode 100644 optimum/exporters/executorch/recipes/xnnpack.py create mode 100644 optimum/exporters/executorch/task_registry.py create mode 100644 optimum/exporters/executorch/tasks/__init__.py create mode 100644 optimum/exporters/executorch/tasks/causal_lm.py create mode 100644 tests/executorch/export/__init__.py create mode 100644 tests/executorch/export/test_exporters_executorch.py create mode 100644 tests/executorch/runtime/__init__.py create mode 100644 tests/executorch/runtime/test_modeling.py create mode 100644 tests/executorch/runtime/test_modeling_gemma.py create mode 100644 tests/executorch/runtime/test_modeling_gemma2.py create mode 100644 tests/executorch/runtime/test_modeling_llama.py create mode 100644 tests/executorch/runtime/test_modeling_olmo.py create mode 100644 tests/executorch/runtime/test_modeling_qwen2.py diff --git a/.github/workflows/test_executorch_export.yml b/.github/workflows/test_executorch_export.yml new file mode 100644 index 00000000000..1571cd0cffb --- /dev/null +++ b/.github/workflows/test_executorch_export.yml @@ -0,0 +1,35 @@ +name: ExecuTorch Export / Python - Test + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + strategy: + fail-fast: false + matrix: + python-version: ['3.10', '3.11', '3.12'] + os: [macos-15] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for ExecuTorch + run: | + pip install .[tests,exporters-executorch] + pip list + - name: Run tests + working-directory: tests + run: 
| + RUN_SLOW=1 pytest executorch/export/test_*.py -s -vvvv --durations=0 diff --git a/.github/workflows/test_executorch_runtime.yml b/.github/workflows/test_executorch_runtime.yml new file mode 100644 index 00000000000..d5bbc0f8eaa --- /dev/null +++ b/.github/workflows/test_executorch_runtime.yml @@ -0,0 +1,42 @@ +name: ExecuTorch Runtime / Python - Test + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + strategy: + fail-fast: false + matrix: + python-version: ['3.10', '3.11', '3.12'] + os: [macos-15] + test-modeling: + - test_modeling_gemma2.py + - test_modeling_gemma.py + - test_modeling_llama.py + - test_modeling_olmo.py + - test_modeling.py + - test_modeling_qwen2.py + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for ExecuTorch + run: | + pip install .[tests,exporters-executorch] + pip list + - name: Run tests + working-directory: tests + run: | + RUN_SLOW=1 pytest executorch/runtime/${{ matrix.test-modeling }} -s -vvvv --durations=0 diff --git a/docs/Dockerfile b/docs/Dockerfile index d76dc50c556..5181177f0db 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -1,4 +1,4 @@ -FROM nikolaik/python-nodejs:python3.9-nodejs18 +FROM nikolaik/python-nodejs:python3.11-nodejs23 ARG commit_sha ARG clone_url @@ -8,4 +8,4 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/doc-builder.git RUN git clone $clone_url && cd optimum && git checkout $commit_sha -RUN python3 -m pip install --no-cache-dir ./optimum[onnxruntime,benchmark,quality,exporters-tf,doc-build,diffusers] +RUN python3 -m pip install --no-cache-dir ./optimum[onnxruntime,benchmark,quality,exporters-executorch,doc-build,diffusers] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 8444da1b9a9..dc69564b045 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -81,6 +81,23 @@ title: Reference isExpanded: false title: "ONNX" + - sections: + - local: exporters/executorch/overview + title: Overview + - sections: + - local: exporters/executorch/usage_guides/export_a_model + title: Export a model to ExecuTorch + - local: exporters/executorch/usage_guides/contribute + title: Add support for exporting an architecture to ExecuTorch + title: How-to guides + - sections: + - local: exporters/executorch/package_reference/configuration + title: ExecuTorch configurations + - local: exporters/executorch/package_reference/export + title: Export functions + title: Reference + isExpanded: false + title: "ExecuTorch" - sections: - local: exporters/tflite/overview title: Overview diff --git a/docs/source/exporters/executorch/overview.mdx b/docs/source/exporters/executorch/overview.mdx new file mode 100644 index 00000000000..0e880968bf7 --- /dev/null +++ b/docs/source/exporters/executorch/overview.mdx @@ -0,0 +1,26 @@ + + +# Overview + +🤗 Optimum handles the export of PyTorch to ExecuTorch in the `exporters.executorch` module. It provides classes, functions, and a command line interface to perform the export easily. 
+ +Supported architectures from [🤗 Transformers](https://huggingface.co/docs/transformers/index): + +- Gemma +- Gemma2 +- Llama2 +- Llama3(Llama3.2) +- OLMo +- Qwen2(Qwen2.5) + +There are many more models are supported by ExecuTorch, we will add those models to Optimum over time. Read more at [pytorch/executorch/examples/](https://github.com/pytorch/executorch/tree/main/examples) diff --git a/docs/source/exporters/executorch/package_reference/configuration.mdx b/docs/source/exporters/executorch/package_reference/configuration.mdx new file mode 100644 index 00000000000..b7a10b80419 --- /dev/null +++ b/docs/source/exporters/executorch/package_reference/configuration.mdx @@ -0,0 +1,54 @@ + + +# Configuration for ExecuTorch Export + +ExecuTorch export provides a flexible configuration mechanism through dynamic registration, enabling users to have +complete control over the export process. The configuration system is divided into task configurations and recipe +configurations, each addressing specific aspects of the export pipeline. + + +## Task Configurations + +Task configurations determine how a Hugging Face model should be loaded and prepared for export, tailored to specific tasks. + +For instance, when exporting a model for a text generation task, the provided configuration utilizes **static caching** and +**SDPA (Scaled Dot-Product Attention)** for inference optimization. + +By leveraging task configurations, users can ensure that their models are appropriately prepared for efficient execution on +the ExecuTorch backend. + +[[autodoc]] exporters.executorch.task_registry.discover_tasks + +[[autodoc]] exporters.executorch.task_registry.register_task + +[[autodoc]] exporters.executorch.tasks.causal_lm.load_causal_lm_model + + +## Recipe Configurations + +Recipe configurations control the specifics of lowering an eager PyTorch module to the ExecuTorch backend. These +configurations allow users to: + +- Specify whether and how to **quantize** the model. +- Delegate computation to various accelerators, such as **CPU**, **GPU**, **NPU**, **DSP**, and others. +- Define **custom transformation passes**. +- Implement advanced techniques like memory planning algorithms to optimize resource utilization. + +[[autodoc]] exporters.executorch.recipe_registry.discover_recipes + +[[autodoc]] exporters.executorch.recipe_registry.register_recipe + +[[autodoc]] exporters.executorch.recipes.xnnpack.export_to_executorch_with_xnnpack + +The combination of task and recipe configurations ensures that users can customize both the high-level task setup +and the low-level export details to suit their deployment requirements. diff --git a/docs/source/exporters/executorch/package_reference/export.mdx b/docs/source/exporters/executorch/package_reference/export.mdx new file mode 100644 index 00000000000..6663eb5278e --- /dev/null +++ b/docs/source/exporters/executorch/package_reference/export.mdx @@ -0,0 +1,26 @@ + + +# Export functions + +## Main functions + +[[autodoc]] exporters.executorch.convert.export_to_executorch + +The primary export function is designed to be **model- and task-independent** as well as **optimization-agnostic**, providing a +highly flexible and modular interface for exporting Hugging Face models to the ExecuTorch backend. + +This approach highlights the **composability** of ExecuTorch export pipeline, where dynamically registered **task configurations** +specify how a :hug model is prepared, and **recipe configurations** encapsulate device-specific optimizations during export. 
This +separation allows users to customize the export process without altering the core function. + +For more details on task and recipe configurations, see the [Configuration for ExecuTorch Export](./configuration.mdx). diff --git a/docs/source/exporters/executorch/usage_guides/contribute.mdx b/docs/source/exporters/executorch/usage_guides/contribute.mdx new file mode 100644 index 00000000000..2c6c1593169 --- /dev/null +++ b/docs/source/exporters/executorch/usage_guides/contribute.mdx @@ -0,0 +1,57 @@ + + +# Adding support for an unsupported architecture + +We welcome contributions to extend the functionality of ExecuTorch export. This guide provides high-level instructions for contributors who want to: + +1. Export a new model that is not currently supported. +2. Add new recipes or support a new task for export. + +--- + +## Exporting a New Model + +If you want to export a model that is not already supported by the library, follow these steps: + +### Step 1: Export and Test the Model +1. Attempt to export and lower the model using an existing task and recipe. On success, it will store the exported model in a `.pte` file. +2. Add a test case for the model in the appropriate test suite. + - For example, you can make sure tests pass for the new `my_new_model` by running: + ```bash + pytest tests/executorch/export/test_*.py -k "test_my_new_model" # doctest: +SKIP + pytest tests/executorch/runtime/test_*.py -k "test_my_new_model" # doctest: +SKIP + ``` + +### Step 2: Handle Export Failures +1. If the export fails in Step 1, report the issue by opening a GitHub issue. +2. If the issue requires changes to the model’s architecture or its Hugging Face implementation, these modifications may be made upstream in the Hugging Face Transformers library. + +--- + +## Adding New Recipes or Tasks + +To extend ExecuTorch with new recipes or tasks, follow these guidelines: + +### Registering a New Recipe +You can add a custom recipe to define specific optimizations or configurations for exporting models. Below is an example: + +```python +from exporters.executorch import register_recipe + +@register_recipe("my_custom_recipe") +def export_with_custom_recipe(model, config, *args, **kwargs): + # Example: Apply a custom quantization +``` + +### Registering a Task +The task registration process is same as adding a recipe. Besides that you may need to implement a new `ExecuTorchModelForXXX` class. diff --git a/docs/source/exporters/executorch/usage_guides/export_a_model.mdx b/docs/source/exporters/executorch/usage_guides/export_a_model.mdx new file mode 100644 index 00000000000..7993188cbd5 --- /dev/null +++ b/docs/source/exporters/executorch/usage_guides/export_a_model.mdx @@ -0,0 +1,124 @@ + + +# Export a model to ExecuTorch with optimum.exporters.executorch + +If you need to deploy 🤗 Transformers models for on-device use cases, we recommend +exporting them to a serialized format that can be distributed and executed on specialized +runtimes and hardware. In this guide, we'll show you how to export these +models to [ExecuTorch](https://pytorch.org/executorch/main/intro-overview.html). + + +## Why ExecuTorch? + +ExecuTorch is the ideal solution for deploying PyTorch models on edge devices, offering a streamlined process from +export to deployment without leaving PyTorch ecosystem. + +Supporting on-device AI presents unique challenges with diverse hardware, critical power requirements, low/no internet +connectivity, and realtime processing needs. 
These constraints have historically prevented or slowed down the creation +of scalable and performant on-device AI solutions. We designed ExecuTorch, backed by our industry partners like Meta, +Arm, Apple, Qualcomm, MediaTek, etc. to be highly portable and provide superior developer productivity without losing on +performance. + + +## Summary + +Exporting a PyTorch model to ExecuTorch is as simple as + +```bash +optimum-cli export executorch --model "meta-llama/Llama-3.2-1B" --task "text-generation" --recipe "xnnpack" --output_dir "meta_llama3_2_1b" +``` + +Check out the help for more options: + +```bash +optimum-cli export executorch --help +``` + + +## Exporting a model to ExecuTorch using the CLI + +To export a 🤗 Transformers model to ExecuTorch, you'll first need to install some extra +dependencies: + +```bash +pip install optimum[exporters-executorch] +``` + +The Optimum ExecuTorch export can be used through Optimum command-line: + +```bash +optimum-cli export executorch --help + +usage: optimum-cli export executorch [-h] -m MODEL [-o OUTPUT_DIR] [--task TASK] [--recipe RECIPE] + +options: + -h, --help show this help message and exit + +Required arguments: + -m MODEL, --model MODEL + Model ID on huggingface.co or path on disk to load model from. + -o OUTPUT_DIR, --output_dir OUTPUT_DIR + Path indicating the directory where to store the generated ExecuTorch model. + --task TASK The task to export the model for. Available tasks depend on the model, but are among: ['audio-classification', 'feature-extraction', 'image-to-text', + 'sentence-similarity', 'depth-estimation', 'image-segmentation', 'audio-frame-classification', 'masked-im', 'semantic-segmentation', 'text-classification', + 'audio-xvector', 'mask-generation', 'question-answering', 'text-to-audio', 'automatic-speech-recognition', 'image-to-image', 'multiple-choice', 'image- + classification', 'text2text-generation', 'token-classification', 'object-detection', 'zero-shot-object-detection', 'zero-shot-image-classification', 'text- + generation', 'fill-mask']. + --recipe RECIPE Pre-defined recipes for export to ExecuTorch. Defaults to "xnnpack". + +``` + +Exporting a checkpoint can be done as follows: + +```bash +optimum-cli export executorch --model "meta-llama/Llama-3.2-1B" --task "text-generation" --recipe "xnnpack" --output_dir "meta_llama3_2_1b" +``` + +You should see a `model.pte` file is stored under "./meta_llama3_2_1b/": + +```bash +meta_llama3_2_1b/ +└── model.pte +``` + +This will fetch the model on the Hub and exports the PyTorch model with the specialized recipe. The resulting `model.pte` file can then be run on the [XNNPACK backend](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering.html), or on many +other ExecuTorh supported backends if exports with different recipes, e.g. Apple's [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html) or [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [Qualcomm's SoCs](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html), [ARM's Ethos-U](https://pytorch.org/executorch/main/executorch-arm-delegate-tutorial.html), [Xtensa HiFi4 DSP](https://pytorch.org/executorch/main/build-run-xtensa.html), [Vulkan GPU](https://pytorch.org/executorch/main/build-run-vulkan.html), [MediaTek](https://pytorch.org/executorch/main/build-run-mediatek-backend.html), etc. 
+ +For example, we can load and run the model with [ExecuTorch +Runtime](https://pytorch.org/executorch/main/runtime-overview.html) using the `optimum.executorchruntime` package as follows: + +```python +>>> from transformers import AutoTokenizer +>>> from optimum.executorchruntime import ExecuTorchModelForCausalLM + +>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") # doctest: +SKIP +>>> model = ExecuTorchModelForCausalLM.from_pretrained("meta_llama3_2_1b/", export=False) # doctest: +SKIP + +>>> generated_text = model.text_generation(tokenizer=tokenizer, prompt="Simply put, the theory of relativity states that", max_seq_len=45) # doctest: +SKIP +``` + +Printing the `generated_text` would give that: + +``` +"Simply put, the theory of relativity states that the laws of physics are the same in all inertial frames of reference. In other words, the laws of physics are the same in all inertial frames of reference." +``` + +As you can see, converting a model to ExecuTorch does not mean leaving the Hugging Face ecosystem. You end up with a similar API as regular 🤗 Transformers models! + +It is also possible to export the model to ExecuTorch directly from the `ExecuTorchModelForCausalLM` class by doing the following: + +```python +>>> from optimum.executorchruntime import ExecuTorchModelForCausalLM + +>>> model = ExecuTorchModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", export=True, task="text-generation", recipe="xnnpack") +``` diff --git a/docs/source/exporters/overview.mdx b/docs/source/exporters/overview.mdx index 6fd7bd9d916..2b4c2e11792 100644 --- a/docs/source/exporters/overview.mdx +++ b/docs/source/exporters/overview.mdx @@ -12,4 +12,4 @@ specific language governing permissions and limitations under the License. # Overview -🤗 Optimum enables exporting models from PyTorch or TensorFlow to different formats through its `exporters` module. For now, two exporting format are supported: ONNX and TFLite (TensorFlow Lite). +🤗 Optimum enables exporting models from PyTorch or TensorFlow to different formats through its `exporters` module. For now, three exporting format are supported: ONNX, TFLite (TensorFlow Lite), and ExecuTorch. diff --git a/optimum/commands/__init__.py b/optimum/commands/__init__.py index 8a2a276d1c5..a31344ed133 100644 --- a/optimum/commands/__init__.py +++ b/optimum/commands/__init__.py @@ -14,5 +14,5 @@ from .base import BaseOptimumCLICommand, CommandInfo, RootOptimumCLICommand from .env import EnvironmentCommand -from .export import ExportCommand, ONNXExportCommand, TFLiteExportCommand +from .export import ExecuTorchExportCommand, ExportCommand, ONNXExportCommand, TFLiteExportCommand from .optimum_cli import optimum_cli_subcommand diff --git a/optimum/commands/export/__init__.py b/optimum/commands/export/__init__.py index 19da68a60d2..b72cd5dbc8d 100644 --- a/optimum/commands/export/__init__.py +++ b/optimum/commands/export/__init__.py @@ -14,5 +14,6 @@ from .base import ExportCommand +from .executorch import ExecuTorchExportCommand from .onnx import ONNXExportCommand from .tflite import TFLiteExportCommand diff --git a/optimum/commands/export/base.py b/optimum/commands/export/base.py index 07737cb8eaf..e5ed4c90ff5 100644 --- a/optimum/commands/export/base.py +++ b/optimum/commands/export/base.py @@ -15,6 +15,7 @@ """optimum.exporters command-line interface base classes.""" from .. 
import BaseOptimumCLICommand, CommandInfo +from .executorch import ExecuTorchExportCommand from .onnx import ONNXExportCommand from .tflite import TFLiteExportCommand @@ -25,6 +26,11 @@ class ExportCommand(BaseOptimumCLICommand): help="Export PyTorch and TensorFlow models to several format.", ) SUBCOMMANDS = ( + CommandInfo( + name="executorch", + help="Export PyTorch model to ExecuTorch.", + subcommand_class=ExecuTorchExportCommand, + ), CommandInfo( name="onnx", help="Export PyTorch and TensorFlow to ONNX.", diff --git a/optimum/commands/export/executorch.py b/optimum/commands/export/executorch.py new file mode 100644 index 00000000000..2bf2f1d3054 --- /dev/null +++ b/optimum/commands/export/executorch.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +"""Defines the command line for the export with ExecuTorch.""" + +from pathlib import Path +from typing import TYPE_CHECKING + +from ...exporters import TasksManager +from ..base import BaseOptimumCLICommand + + +if TYPE_CHECKING: + from argparse import ArgumentParser + + +def parse_args_executorch(parser): + required_group = parser.add_argument_group("Required arguments") + required_group.add_argument( + "-m", "--model", type=str, required=True, help="Model ID on huggingface.co or path on disk to load model from." + ) + required_group.add_argument( + "-o", + "--output_dir", + type=Path, + help="Path indicating the directory where to store the generated ExecuTorch model.", + ) + required_group.add_argument( + "--task", + type=str, + default="text-generation", + help=( + "The task to export the model for. Available tasks depend on the model, but are among:" + f" {str(TasksManager.get_all_tasks())}." + ), + ) + required_group.add_argument( + "--recipe", + type=str, + default="xnnpack", + help='Pre-defined recipes for export to ExecuTorch. Defaults to "xnnpack".', + ) + + +class ExecuTorchExportCommand(BaseOptimumCLICommand): + @staticmethod + def parse_args(parser: "ArgumentParser"): + return parse_args_executorch(parser) + + def run(self): + from ...exporters.executorch import main_export + + main_export( + model_name_or_path=self.args.model, + task=self.args.task, + recipe=self.args.recipe, + output_dir=self.args.output_dir, + ) diff --git a/optimum/executorchruntime/__init__.py b/optimum/executorchruntime/__init__.py new file mode 100644 index 00000000000..0a84c3a139b --- /dev/null +++ b/optimum/executorchruntime/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. + +from typing import TYPE_CHECKING + +from transformers.utils import _LazyModule + + +_import_structure = { + "modeling_executorch": [ + "ExecuTorchModelForCausalLM", + ], +} + +if TYPE_CHECKING: + from .modeling_executorch import ExecuTorchModelForCausalLM +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/optimum/executorchruntime/modeling_executorch.py b/optimum/executorchruntime/modeling_executorch.py new file mode 100644 index 00000000000..b93309f6a48 --- /dev/null +++ b/optimum/executorchruntime/modeling_executorch.py @@ -0,0 +1,460 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +"""ExecuTorchModelForXXX classes, allowing to run ExecuTorch Models with ExecuTorch Runtime using the same API as Transformers.""" + +import logging +import os +import warnings +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import List, Optional, Union + +import torch +from executorch.extension.pybindings.portable_lib import ( + ExecuTorchModule, + _load_for_executorch, +) +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from transformers import ( + AutoModelForCausalLM, + PretrainedConfig, + PreTrainedTokenizer, +) + +from ..exporters.executorch import main_export +from ..modeling_base import OptimizedModel + + +logger = logging.getLogger(__name__) + + +class ExecuTorchModelForCausalLM(OptimizedModel): + """ + ExecuTorch model with a causal language modeling head for inference using the ExecuTorch Runtime. + + This class provides an interface for loading, running, and generating outputs from a causal language model + optimized for ExecuTorch Runtime. It includes utilities for exporting and loading pre-trained models + compatible with ExecuTorch runtime. + + Attributes: + auto_model_class (`Type`): + Associated Transformers class, `AutoModelForCausalLM`. + et_model (`ExecuTorchModule`): + The loaded ExecuTorch model. + use_kv_cache (`bool`): + Whether key-value caching is enabled. For performance reasons, the exported model is + optimized to use a static cache. + max_cache_size (`int`): + Maximum sequence length supported by the cache. + max_batch_size (`int`): + Maximum supported batch size. + dtype (`str`): + Data type of the model parameters. + bos_token_id (`int`): + Beginning-of-sequence token ID. + eos_token_id (`int`): + End-of-sequence token ID. + vocab_size (`int`): + Size of the model vocabulary. 
+ """ + + auto_model_class = AutoModelForCausalLM + + def __init__( + self, + model: "ExecuTorchModule", + config: "PretrainedConfig", + ): + super().__init__(model, config) + self.et_model = model + metadata = self.et_model.method_names() + logging.info(f"Load all static methods: {metadata}") + if "use_kv_cache" in metadata: + self.use_kv_cache = self.et_model.run_method("use_kv_cache")[0] + if "get_max_seq_len" in metadata: + self.max_cache_size = self.et_model.run_method("get_max_seq_len")[0] + if "get_max_batch_size" in metadata: + self.max_batch_size = self.et_model.run_method("get_max_batch_size")[0] + if "get_dtype" in metadata: + self.dtype = self.et_model.run_method("get_dtype")[0] + if "get_bos_id" in metadata: + self.bos_token_id = self.et_model.run_method("get_bos_id")[0] + if "get_eos_id" in metadata: + self.eos_token_id = self.et_model.run_method("get_eos_id")[0] + if "get_vocab_size" in metadata: + self.vocab_size = self.et_model.run_method("get_vocab_size")[0] + + def forward( + self, + input_ids: torch.Tensor, + cache_position: torch.Tensor, + ) -> torch.Tensor: + """ + Forward pass of the model, which is compatible with the ExecuTorch runtime for LLM. + + Args: + input_ids (`torch.Tensor`): Tensor representing current input token id to the model. + cache_position (`torch.Tensor`): Tensor representing current input position in the cache. + + Returns: + torch.Tensor: Logits output from the model. + """ + return self.et_model.forward((input_ids, cache_position))[0] + + @classmethod + def from_pretrained( + cls, + model_name_or_path: Union[str, Path], + export: bool = True, + task: str = "", + recipe: str = "", + config: "PretrainedConfig" = None, + subfolder: str = "", + revision: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, + **kwargs, + ) -> "ExecuTorchModelForCausalLM": + """ + Load a pre-trained ExecuTorch model. + + Args: + model_name_or_path (`Union[str, Path]`): + Model ID on huggingface.co or path on disk to the model repository to export. Example: `model_name_or_path="meta-llama/Llama-3.2-1B"` or `mode_name_or_path="/path/to/model_folder`. + export (`bool`, *optional*, defaults to `True`): + If `True`, the model will be exported from eager to ExecuTorch after fetched from huggingface.co. `model_name_or_path` must be a valid model ID on huggingface.co. + If `False`, the previously exported ExecuTorch model will be loaded from a local path. `model_name_or_path` must be a valid local directory where a `model.pte` is stored. + task (`str`, defaults to `""`): + The task to export the model for, e.g. "text-generation". It is required to specify a task when `export` is `True`. + recipe (`str`, defaults to `""`): + The recipe to use to do the export, e.g. "xnnpack". It is required to specify a task when `export` is `True`. + config (`PretrainedConfig`, *optional*): + Configuration of the pre-trained model. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. 
+ force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`): + Deprecated. Please use the `token` argument instead. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + **kwargs: + Additional configuration options to tasks and recipes. + + Returns: + `ExecuTorchModelForCausalLM`: An instance of the ExecuTorch model for text generation task. + """ + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") + token = use_auth_token + + if export: + # Fetch the model from huggingface.co and export it to ExecuTorch + if task == "": + raise ValueError("Please specify a task to export the model for.") + if recipe == "": + raise ValueError("Please specify a recipe to export the model for.") + return cls._export( + model_id=model_name_or_path, + task=task, + recipe=recipe, + config=config, + **kwargs, + ) + else: + # Load the ExecuTorch model from a local path + return cls._from_pretrained( + model_dir_path=model_name_or_path, + config=config, + ) + + @classmethod + def _from_pretrained( + cls, + model_dir_path: Union[str, Path], + config: PretrainedConfig, + subfolder: str = "", + revision: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, + ) -> "ExecuTorchModelForCausalLM": + """ + Load a pre-trained ExecuTorch model from a local directory. + + Args: + model_dir_path (`Union[str, Path]`): + Path to the directory containing the ExecuTorch model file (`model.pte`). + config (`PretrainedConfig`, *optional*): + Configuration of the pre-trained model. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`): + Deprecated. Please use the `token` argument instead. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. 
If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + + Returns: + `ExecuTorchModelForCausalLM`: The initialized ExecuTorch model. + + """ + full_path = os.path.join(f"{model_dir_path}", "model.pte") + model = _load_for_executorch(full_path) + logging.info(f"Loaded model from {full_path}") + logging.debug(f"{model.method_meta('forward')}") + return cls( + model=model, + config=config, + ) + + def _save_pretrained(self, save_directory): + """ + Saves a model weights into a directory, so that it can be re-loaded using the + [`from_pretrained`] class method. + """ + raise NotImplementedError + + @classmethod + def _export( + cls, + model_id: str, + task: str, + recipe: str, + config: PretrainedConfig, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + trust_remote_code: bool = False, + subfolder: str = "", + revision: Optional[str] = None, + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, + **kwargs, + ): + """ + Fetch a model from the Hugging Face Hub and export it to ExecuTorch format. + + Args: + model_id (`str`): + Model ID on huggingface.co, for example: `model_name_or_path="meta-llama/Llama-3.2-1B"`. + task (`str`): + The task to export the model for, e.g. "text-generation". + recipe (`str`): + The recipe to use to do the export, e.g. "xnnpack". + config (`PretrainedConfig`, *optional*): + Configuration of the pre-trained model. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories + you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the + model repository. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`): + Deprecated. Please use the `token` argument instead. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + **kwargs: + Additional configuration options to tasks and recipes. + + Returns: + `ExecuTorchModelForCausalLM`: The loaded and exported ExecuTorch model. + + """ + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") + token = use_auth_token + + save_dir = TemporaryDirectory() + save_dir_path = Path(save_dir.name) + + # Export to ExecuTorch and save the pte file to the temporary directory + main_export( + model_name_or_path=model_id, + output_dir=save_dir_path, + task=task, + recipe=recipe, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + local_files_only=local_files_only, + force_download=force_download, + trust_remote_code=trust_remote_code, + **kwargs, + ) + + return cls._from_pretrained( + model_dir_path=save_dir_path, + config=config, + use_auth_token=use_auth_token, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + local_files_only=local_files_only, + force_download=force_download, + ) + + def generate( + self, + prompt_tokens: List[int], + echo: bool = False, + pos_base: int = 0, + max_seq_len: Optional[int] = None, + ) -> List[int]: + """ + Generate tokens from a prompt using the ExecuTorch model. + + Args: + prompt_tokens (List[int]): + List of token IDs representing the prompt. + echo (`bool`, *optional*): + Whether to include prompt tokens in the generated output. Defaults to `False`. + pos_base (`int`, *optional*): + Base position for the prompt tokens. Defaults to 0. + max_seq_len (`int`, *optional*): + Maximum sequence length for the generated output. + Defaults to None and uses the model's `max_cache_size` attribute. + Will be truncated to maximal cache size if larger than `max_cache_size`. + + Returns: + List[int]: List of generated token IDs. + + Note: + Temporarily implemented this method in Python due to limited access to ExecuTorch's c++ LLM runner via pybind. + Expect improvements to the pybind interface in ExecuTorch version 0.4.1. + """ + self.device = torch.device("cpu") + if max_seq_len is None: + # Default to max_cache_size if max_seq_len is not specified + max_seq_len = self.max_cache_size + elif max_seq_len > self.max_cache_size: + logging.warning( + f"max_seq_len={max_seq_len} is larger than max_cache_size={self.max_cache_size}. Generating tokens will be truncated to max_cache_size." + ) + max_seq_len = self.max_cache_size + generated_tokens = [] + + # prefill + for i, prompt_token in enumerate(prompt_tokens): + logits = self.forward( + input_ids=torch.tensor([prompt_token], dtype=torch.long, device=self.device).unsqueeze(0), + cache_position=torch.tensor([i], dtype=torch.long, device=self.device), + ) + + next_token = torch.argmax(logits, dim=-1).item() + generated_tokens = prompt_tokens + [next_token] + + while len(generated_tokens) < max_seq_len: + logits = self.forward( + input_ids=torch.tensor([next_token], dtype=torch.long, device=self.device).unsqueeze(0), + cache_position=torch.tensor( + [pos_base + len(generated_tokens) - 1], + dtype=torch.long, + device=self.device, + ), + ) + next_token = torch.argmax(logits, dim=-1).item() + generated_tokens.append(next_token) + if next_token == self.eos_token_id: + break + + return generated_tokens if echo else generated_tokens[len(prompt_tokens) :] + + def text_generation( + self, + tokenizer: "PreTrainedTokenizer", + prompt: str, + echo: bool = True, + max_seq_len: Optional[int] = None, + ): + """ + Perform text generation task for a given prompt using the ExecuTorch model. 
+ + Args: + tokenizer (`PreTrainedTokenizer`): + The tokenizer used to encode and decode the prompt and output. + prompt (`str`): + The text prompt to complete. + echo (`bool`, *optional*): + Whether to include prompt tokens in the generated output. Defaults to `True`. + max_seq_len (`int`, *optional*): + Maximum sequence length for the generated output. + Defaults to None and uses the model's `max_cache_size` attribute. + Will be truncated to maximal cache size if larger than `max_cache_size`. + """ + self.tokenizer = tokenizer + + # Sanity check + if self.tokenizer.bos_token_id is not None and self.tokenizer.bos_token_id != self.bos_token_id: + raise ValueError( + f"The tokenizer's bos_token_id={self.tokenizer.bos_token_id} must be the same as the model's bos_token_id={self.bos_token_id}." + ) + if self.tokenizer.eos_token_id is not None and self.tokenizer.eos_token_id != self.eos_token_id: + raise ValueError( + f"The tokenizer's eos_token_id={self.tokenizer.eos_token_id} must be the same as the model's eos_token_id={self.eos_token_id}." + ) + + prompt_tokens = self.tokenizer.encode(prompt) + generated_tokens = self.generate( + prompt_tokens=prompt_tokens, + echo=echo, + max_seq_len=max_seq_len, + ) + return self.tokenizer.decode(generated_tokens, skip_special_tokens=True) diff --git a/optimum/exporters/__init__.py b/optimum/exporters/__init__.py index eef17dac7f7..7b08812a569 100644 --- a/optimum/exporters/__init__.py +++ b/optimum/exporters/__init__.py @@ -13,4 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. from . import onnx # noqa +from . import executorch # noqa from .tasks import TasksManager # noqa diff --git a/optimum/exporters/executorch/__init__.py b/optimum/exporters/executorch/__init__.py new file mode 100644 index 00000000000..3409e69fcfb --- /dev/null +++ b/optimum/exporters/executorch/__init__.py @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +from typing import TYPE_CHECKING + +from transformers.utils import _LazyModule + + +_import_structure = { + "convert": [ + "export_to_executorch", + ], + "recipe_registry": [ + "discover_recipes", + "register_recipe", + ], + "task_registry": [ + "discover_tasks", + "register_task", + ], + "tasks": [ + "causal_lm", + ], + "recipes": [ + "xnnpack", + ], + "__main__": ["main_export"], +} + +if TYPE_CHECKING: + from .__main__ import main_export + from .convert import export_to_executorch +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/optimum/exporters/executorch/__main__.py b/optimum/exporters/executorch/__main__.py new file mode 100644 index 00000000000..33a668b0674 --- /dev/null +++ b/optimum/exporters/executorch/__main__.py @@ -0,0 +1,160 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +"""Entry point to the optimum.exporters.executorch command line.""" + +import argparse +import os +import warnings +from pathlib import Path + +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from transformers.utils import is_torch_available + +from optimum.utils.import_utils import check_if_transformers_greater + +from ...commands.export.executorch import parse_args_executorch +from .convert import export_to_executorch +from .task_registry import discover_tasks, task_registry + + +if is_torch_available(): + pass + +from typing import Optional, Union + + +def main_export( + model_name_or_path: str, + task: str, + recipe: str, + output_dir: Union[str, Path], + cache_dir: str = HUGGINGFACE_HUB_CACHE, + trust_remote_code: bool = False, + pad_token_id: Optional[int] = None, + subfolder: str = "", + revision: str = "main", + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, + **kwargs, +): + """ + Full-suite ExecuTorch export function, exporting **from a model ID on Hugging Face Hub or a local model repository**. + + Args: + model_name_or_path (`str`): + Model ID on huggingface.co or path on disk to the model repository to export. Example: `model_name_or_path="meta-llama/Llama-3.2-1B"` or `mode_name_or_path="/path/to/model_folder`. + task (`str`): + The task to export the model for, e.g. "text-generation". + recipe (`str`): + The recipe to use to do the export, e.g. "xnnpack". + output_dir (`Union[str, Path]`): + Path indicating the directory where to store the generated ExecuTorch model. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories + you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the + model repository. + pad_token_id (`Optional[int]`, defaults to `None`): + This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). 
+ use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`): + Deprecated. Please use the `token` argument instead. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + **kwargs: + Additional configuration options to tasks and recipes. + + Example usage: + ```python + >>> from optimum.exporters.executorch import main_export + + >>> main_export("meta-llama/Llama-3.2-1B", "text-generation", "xnnpack", "meta_llama3_2_1b/") + ``` + """ + + if not check_if_transformers_greater("4.46"): + raise ValueError( + "The minimum Transformers version compatible with ExecuTorch is 4.46.0. Please upgrade to Transformers 4.46.0 or later." + ) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") + token = use_auth_token + + # Dynamically discover and import registered tasks + discover_tasks() + + # Load the model for specific task + try: + task_func = task_registry.get(task) + except KeyError as e: + raise RuntimeError(f"The task '{task}' isn't registered. Detailed error: {e}") + + model = task_func(model_name_or_path, **kwargs) + + if task == "text-generation": + from transformers.integrations.executorch import TorchExportableModuleWithStaticCache + + model = TorchExportableModuleWithStaticCache(model) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + return export_to_executorch( + model=model, + task=task, + recipe=recipe, + output_dir=output_dir, + **kwargs, + ) + + +def main(): + parser = argparse.ArgumentParser("Hugging Face Optimum ExecuTorch exporter") + + parse_args_executorch(parser) + + # Retrieve CLI arguments + args = parser.parse_args() + + main_export( + model_name_or_path=args.model, + output_dir=args.output_dir, + task=args.task, + recipe=args.recipe, + cache_dir=args.cache_dir, + trust_remote_code=args.trust_remote_code, + pad_token_id=args.pad_token_id, + ) + + +if __name__ == "__main__": + main() diff --git a/optimum/exporters/executorch/convert.py b/optimum/exporters/executorch/convert.py new file mode 100644 index 00000000000..f50a4b54a96 --- /dev/null +++ b/optimum/exporters/executorch/convert.py @@ -0,0 +1,90 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +"""ExecuTorch model check and export functions.""" + +import logging +import os +from pathlib import Path +from typing import Union + +from transformers.utils import is_torch_available + +from optimum.utils.import_utils import check_if_transformers_greater + +from .recipe_registry import discover_recipes, recipe_registry + + +if is_torch_available(): + from transformers.modeling_utils import PreTrainedModel + +if check_if_transformers_greater("4.46"): + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + ) + +logger = logging.getLogger(__name__) + + +def export_to_executorch( + model: Union["PreTrainedModel", "TorchExportableModuleWithStaticCache"], + task: str, + recipe: str, + output_dir: Union[str, Path], + **kwargs, +): + """ + Export a pre-trained PyTorch model to the ExecuTorch format using a specified recipe. + + This function facilitates the transformation of a PyTorch model into an optimized ExecuTorch program. + + Args: + model (`Union["PreTrainedModel", "TorchExportableModuleWithStaticCache"]`): + A PyTorch model to be exported. This can be a standard HuggingFace `PreTrainedModel` or a wrapped + module like `TorchExportableModuleWithStaticCache` for text generation task. + task (`str`): + The specific task the exported model will perform, e.g., "text-generation". + recipe (`str`): + The recipe to guide the export process, e.g., "xnnpack". Recipes define the optimization and lowering steps. + Will raise an exception if the specified recipe is not registered in the recipe registry. + output_dir (`Union[str, Path]`): + Path to the directory where the resulting ExecuTorch model will be saved. + **kwargs: + Additional configuration options passed to the recipe. + + Returns: + `ExecuTorchProgram`: + The lowered ExecuTorch program object. + + Notes: + - The function uses a dynamic recipe discovery mechanism to identify and import the specified recipe. + - The exported model is stored in the specified output directory with the fixed filename `model.pte`. + - The resulting ExecuTorch program is serialized and saved to the output directory. + """ + + # Dynamically discover and import registered recipes + discover_recipes() + + # Export and lower the model to ExecuTorch with the recipe + try: + recipe_func = recipe_registry.get(recipe) + except KeyError as e: + raise RuntimeError(f"The recipe '{recipe}' isn't registered. Detailed error: {e}") + + executorch_prog = recipe_func(model, task, **kwargs) + + full_path = os.path.join(f"{output_dir}", "model.pte") + with open(full_path, "wb") as f: + executorch_prog.write_to_file(f) + logging.info(f"Saved exported program to {full_path}") + + return executorch_prog diff --git a/optimum/exporters/executorch/recipe_registry.py b/optimum/exporters/executorch/recipe_registry.py new file mode 100644 index 00000000000..2eb728b7573 --- /dev/null +++ b/optimum/exporters/executorch/recipe_registry.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. + +import importlib +import logging +import pkgutil + + +logger = logging.getLogger(__name__) + +recipe_registry = {} + +package_name = "optimum.exporters.executorch.recipes" + + +def register_recipe(recipe_name): + """ + Decorator to register a recipe for exporting and lowering an ExecuTorch model under a specific name. + + Args: + recipe_name (`str`): + The name of the recipe to associate with a callable recipe. + + Returns: + `Callable`: + The original function wrapped as a registered recipe. + + Example: + ```python + @register_recipe("my_new_recipe") + def my_new_recipe(...): + ... + ``` + """ + + def decorator(func): + recipe_registry[recipe_name] = func + return func + + return decorator + + +def discover_recipes(): + """ + Dynamically discovers and imports all recipe modules within the `optimum.exporters.executorch.recipes` package. + + Ensures recipes under `./recipes` directory are dynamically loaded without requiring manual imports. + + Notes: + New recipes **must** be added to the `./recipes` directory to be discovered and used by `main_export`. + Failure to do so will prevent dynamic discovery and registration. Recipes must also use the + `@register_recipe` decorator to be properly registered in the `recipe_registry`. + """ + package = importlib.import_module(package_name) + package_path = package.__path__ + + for _, module_name, _ in pkgutil.iter_modules(package_path): + logger.info(f"Importing {package_name}.{module_name}") + importlib.import_module(f"{package_name}.{module_name}") diff --git a/optimum/exporters/executorch/recipes/__init__.py b/optimum/exporters/executorch/recipes/__init__.py new file mode 100644 index 00000000000..a2e21cf3970 --- /dev/null +++ b/optimum/exporters/executorch/recipes/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +from . import xnnpack diff --git a/optimum/exporters/executorch/recipes/xnnpack.py b/optimum/exporters/executorch/recipes/xnnpack.py new file mode 100644 index 00000000000..d3b3a5d52aa --- /dev/null +++ b/optimum/exporters/executorch/recipes/xnnpack.py @@ -0,0 +1,97 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +from typing import Union + +import torch +import torch.export._trace +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchBackendConfig, + to_edge_transform_and_lower, +) +from torch.nn.attention import SDPBackend +from transformers import PreTrainedModel, TorchExportableModuleWithStaticCache + +from ..recipe_registry import register_recipe + + +@register_recipe("xnnpack") +def export_to_executorch_with_xnnpack( + model: Union[PreTrainedModel, TorchExportableModuleWithStaticCache], + task: str, + **kwargs, +): + """ + Export a PyTorch model to ExecuTorch w/ delegation to XNNPACK backend. + + This function also write metadata required by the ExecuTorch runtime to the model. + + Args: + model (Union[PreTrainedModel, TorchExportableModuleWithStaticCache]): + The PyTorch model to be exported to ExecuTorch. + task (str): + The task name to export the model for (e.g., "text-generation"). + **kwargs: + Additional keyword arguments for recipe-specific configurations. + + Returns: + ExecuTorchProgram: + The exported and optimized program for ExecuTorch. + """ + metadata = {} + if task == "text-generation": + example_input_ids = torch.tensor([[1]], dtype=torch.long) + example_cache_position = torch.tensor([0], dtype=torch.long) + + def _get_constant_methods(model: PreTrainedModel): + metadata = { + "get_dtype": 5 if model.config.torch_dtype == torch.float16 else 6, + "get_bos_id": model.config.bos_token_id, + "get_eos_id": model.config.eos_token_id, + "get_head_dim": model.config.hidden_size / model.config.num_attention_heads, + "get_max_batch_size": model.generation_config.cache_config.batch_size, + "get_max_seq_len": model.generation_config.cache_config.max_cache_len, + "get_n_kv_heads": model.config.num_key_value_heads, + "get_n_layers": model.config.num_hidden_layers, + "get_vocab_size": model.config.vocab_size, + "use_kv_cache": model.generation_config.use_cache, + } + return {k: v for k, v in metadata.items() if v is not None} + + metadata = _get_constant_methods(model if isinstance(model, PreTrainedModel) else model.model) + else: + # TODO: Prepare model inputs for other tasks + raise ValueError(f"Unsupported task '{task}'.") + + with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + exported_program = torch.export._trace._export( + model, + args=(example_input_ids,), + kwargs={"cache_position": example_cache_position}, + pre_dispatch=False, + strict=True, + ) + + return to_edge_transform_and_lower( + exported_program, + partitioner=[XnnpackPartitioner()], + compile_config=EdgeCompileConfig( + _skip_dim_order=True, + ), + constant_methods=metadata, + ).to_executorch( + config=ExecutorchBackendConfig( + extract_delegate_segments=True, + ), + ) diff --git a/optimum/exporters/executorch/task_registry.py b/optimum/exporters/executorch/task_registry.py new file mode 100644 index 00000000000..fdc34f0359a --- /dev/null +++ b/optimum/exporters/executorch/task_registry.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import importlib +import logging +import pkgutil + + +logger = logging.getLogger(__name__) + +task_registry = {} + +package_name = "optimum.exporters.executorch.tasks" + + +def register_task(task_name): + """ + Decorator to register a task under a specific name. + + Args: + task_name (`str`): + The name of the task to associate with a callable task. + + Returns: + `Callable`: + The original function wrapped as a registered task. + + Example: + ```python + @register_task("my_new_task") + def my_new_task(...): + ... + ``` + """ + + def decorator(func): + task_registry[task_name] = func + return func + + return decorator + + +def discover_tasks(): + """ + Dynamically discovers and imports all task modules within the `optimum.exporters.executorch.tasks` package. + + Ensures tasks under `./tasks` directory are dynamically loaded without requiring manual imports. + + Notes: + New tasks **must** be added to the `./tasks` directory to be discovered and used by `main_export`. + Failure to do so will prevent dynamic discovery and registration. Tasks must also use the + `@register_task` decorator to be properly registered in the `task_registry`. + """ + package = importlib.import_module(package_name) + package_path = package.__path__ + + for _, module_name, _ in pkgutil.iter_modules(package_path): + logger.info(f"Importing {package_name}.{module_name}") + importlib.import_module(f"{package_name}.{module_name}") diff --git a/optimum/exporters/executorch/tasks/__init__.py b/optimum/exporters/executorch/tasks/__init__.py new file mode 100644 index 00000000000..754a8241ca3 --- /dev/null +++ b/optimum/exporters/executorch/tasks/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +from . import causal_lm diff --git a/optimum/exporters/executorch/tasks/causal_lm.py b/optimum/exporters/executorch/tasks/causal_lm.py new file mode 100644 index 00000000000..b02da8b319e --- /dev/null +++ b/optimum/exporters/executorch/tasks/causal_lm.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. + +from transformers import AutoModelForCausalLM, GenerationConfig + +from ..task_registry import register_task + + +@register_task("text-generation") +def load_causal_lm_model(model_name_or_path: str, **kwargs): + """ + Loads a causal language model for text generation and registers it under the task + 'text-generation' using Hugging Face's AutoModelForCausalLM. + + Args: + model_name_or_path (str): + Model ID on huggingface.co or path on disk to the model repository to export. For example: + `model_name_or_path="meta-llama/Llama-3.2-1B"` or `mode_name_or_path="/path/to/model_folder` + **kwargs: + Additional configuration options for the model: + - dtype (str, optional): + Data type for model weights (default: "float32"). + Options include "float16" and "bfloat16". + - attn_implementation (str, optional): + Attention mechanism implementation (default: "sdpa"). + - cache_implementation (str, optional): + Cache management strategy (default: "static"). + - max_length (int, optional): + Maximum sequence length for generation (default: 2048). + + Returns: + transformers.PreTrainedModel: + An instance of a model subclass (e.g., Llama, Gemma) with the configuration for exporting + and lowering to ExecuTorch. + """ + device = "cpu" + batch_size = 1 + dtype = kwargs.get("dtype", "float32") + attn_implementation = kwargs.get("attn_implementation", "sdpa") + cache_implementation = kwargs.get("cache_implementation", "static") + max_length = kwargs.get("max_length", 2048) + + return AutoModelForCausalLM.from_pretrained( + model_name_or_path, + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_length, + }, + ), + ) diff --git a/setup.py b/setup.py index 28b6941ebe8..555580528fe 100644 --- a/setup.py +++ b/setup.py @@ -85,6 +85,10 @@ "datasets<=2.16", "transformers>=4.36,<4.38", ], + "exporters-executorch": [ + "executorch>=0.4.0", + "transformers>=4.46", + ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", "openvino": "optimum-intel[openvino]>=1.18.0", diff --git a/tests/executorch/export/__init__.py b/tests/executorch/export/__init__.py new file mode 100644 index 00000000000..fdc02578672 --- /dev/null +++ b/tests/executorch/export/__init__.py @@ -0,0 +1,14 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/executorch/export/test_exporters_executorch.py b/tests/executorch/export/test_exporters_executorch.py new file mode 100644 index 00000000000..f2467105e4f --- /dev/null +++ b/tests/executorch/export/test_exporters_executorch.py @@ -0,0 +1,115 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import subprocess +import tempfile +import unittest + +import pytest +from transformers.testing_utils import slow + + +class TestExportToExecuTorchCLI(unittest.TestCase): + def test_helps_no_raise(self): + subprocess.run( + "optimum-cli export executorch --help", + shell=True, + check=True, + ) + + @slow + @pytest.mark.run_slow + def test_llama3_2_1b_export_to_executorch(self): + model_id = "NousResearch/Llama-3.2-1B" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_llama3_2_3b_export_to_executorch(self): + model_id = "NousResearch/Hermes-3-Llama-3.2-3B" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_qwen2_5_export_to_executorch(self): + model_id = "Qwen/Qwen2.5-0.5B" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_gemma2_export_to_executorch(self): + model_id = "unsloth/gemma-2-2b-it" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_gemma_export_to_executorch(self): + model_id = "weqweasdas/RM-Gemma-2B" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_olmo_export_to_executorch(self): + model_id = "allenai/OLMo-1B-hf" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + 
self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte"))
diff --git a/tests/executorch/runtime/__init__.py b/tests/executorch/runtime/__init__.py
new file mode 100644
index 00000000000..fdc02578672
--- /dev/null
+++ b/tests/executorch/runtime/__init__.py
@@ -0,0 +1,14 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/executorch/runtime/test_modeling.py b/tests/executorch/runtime/test_modeling.py
new file mode 100644
index 00000000000..c97b461403c
--- /dev/null
+++ b/tests/executorch/runtime/test_modeling.py
@@ -0,0 +1,70 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+import unittest
+
+import pytest
+from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from transformers.testing_utils import (
+    slow,
+)
+
+from optimum.executorchruntime import ExecuTorchModelForCausalLM
+
+
+class ExecuTorchModelIntegrationTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @slow
+    @pytest.mark.run_slow
+    def test_load_model_from_hub(self):
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_name_or_path="NousResearch/Llama-3.2-1B",
+            export=True,
+            task="text-generation",
+            recipe="xnnpack",
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+    @slow
+    @pytest.mark.run_slow
+    def test_load_model_from_local_path(self):
+        from optimum.exporters.executorch import main_export
+
+        model_id = "NousResearch/Llama-3.2-1B"
+        task = "text-generation"
+        recipe = "xnnpack"
+
+        with tempfile.TemporaryDirectory() as tempdir:
+            # Export to a local dir
+            main_export(
+                model_name_or_path=model_id,
+                task=task,
+                recipe=recipe,
+                output_dir=tempdir,
+            )
+            self.assertTrue(os.path.exists(f"{tempdir}/model.pte"))
+
+            # Load the exported model from a local dir
+            model = ExecuTorchModelForCausalLM.from_pretrained(
+                model_name_or_path=tempdir,
+                export=False,
+            )
+            self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+            self.assertIsInstance(model.model, ExecuTorchModule)
diff --git a/tests/executorch/runtime/test_modeling_gemma.py b/tests/executorch/runtime/test_modeling_gemma.py
new file mode 100644
index 00000000000..0e4238bf8ee
--- /dev/null
+++ b/tests/executorch/runtime/test_modeling_gemma.py
@@ -0,0 +1,54 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import pytest
+from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from transformers import AutoTokenizer
+from transformers.testing_utils import (
+    slow,
+)
+
+from optimum.executorchruntime import ExecuTorchModelForCausalLM
+
+
+class ExecuTorchModelIntegrationTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @slow
+    @pytest.mark.run_slow
+    def test_gemma_text_generation_with_xnnpack(self):
+        # TODO: Switch to use google/gemma-2b once https://github.com/huggingface/optimum/issues/2127 is fixed
+        # model_id = "google/gemma-2b"
+        model_id = "weqweasdas/RM-Gemma-2B"
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_name_or_path=model_id,
+            export=True,
+            task="text-generation",
+            recipe="xnnpack",
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        EXPECTED_GENERATED_TEXT = "Hello I am doing a project for my school and I need to write a report on the history of the United States."
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt="Hello I am doing a project for my school",
+            max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)),
+        )
+        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)
diff --git a/tests/executorch/runtime/test_modeling_gemma2.py b/tests/executorch/runtime/test_modeling_gemma2.py
new file mode 100644
index 00000000000..22fe4ab60d7
--- /dev/null
+++ b/tests/executorch/runtime/test_modeling_gemma2.py
@@ -0,0 +1,56 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import pytest
+from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from transformers import AutoTokenizer
+from transformers.testing_utils import (
+    slow,
+)
+
+from optimum.executorchruntime import ExecuTorchModelForCausalLM
+
+
+class ExecuTorchModelIntegrationTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @slow
+    @pytest.mark.run_slow
+    def test_gemma2_text_generation_with_xnnpack(self):
+        # TODO: Switch to use google/gemma-2-2b once https://github.com/huggingface/optimum/issues/2127 is fixed
+        # model_id = "google/gemma-2-2b"
+        model_id = "unsloth/gemma-2-2b-it"
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_name_or_path=model_id,
+            export=True,
+            task="text-generation",
+            recipe="xnnpack",
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        EXPECTED_GENERATED_TEXT = (
+            "Hello I am doing a project for my school and I need to make sure it is a great to be creative and I can!"
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt="Hello I am doing a project for my school",
+            max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)),
+        )
+        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)
diff --git a/tests/executorch/runtime/test_modeling_llama.py b/tests/executorch/runtime/test_modeling_llama.py
new file mode 100644
index 00000000000..fb08a5615a5
--- /dev/null
+++ b/tests/executorch/runtime/test_modeling_llama.py
@@ -0,0 +1,83 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import pytest
+from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from transformers import AutoTokenizer
+from transformers.testing_utils import (
+    slow,
+)
+
+from optimum.executorchruntime import ExecuTorchModelForCausalLM
+
+
+class ExecuTorchModelIntegrationTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @slow
+    @pytest.mark.run_slow
+    def test_llama3_2_1b_text_generation_with_xnnpack(self):
+        # TODO: Switch to use meta-llama/Llama-3.2-1B once https://github.com/huggingface/optimum/issues/2127 is fixed
+        # model_id = "lama/Llama-3.2-1B"
+        model_id = "NousResearch/Llama-3.2-1B"
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_name_or_path=model_id,
+            export=True,
+            task="text-generation",
+            recipe="xnnpack",
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        EXPECTED_GENERATED_TEXT = "Simply put, the theory of relativity states that the laws of physics are the same in all inertial frames of reference."
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt="Simply put, the theory of relativity states that",
+            max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)),
+        )
+        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skip(reason="OOMs with macos-15 CI instances on GH.")
+    def test_llama3_2_3b_text_generation_with_xnnpack(self):
+        # TODO: Switch to use meta-llama/Llama-3.2-3B once https://github.com/huggingface/optimum/issues/2127 is fixed
+        # model_id = "lama/Llama-3.2-3B"
+        model_id = "NousResearch/Hermes-3-Llama-3.2-3B"
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_name_or_path=model_id,
+            export=True,
+            task="text-generation",
+            recipe="xnnpack",
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        EXPECTED_GENERATED_TEXT = (
+            "Simply put, the theory of relativity states that time is relative and can be affected "
+            "by an object's speed. This theory was developed by Albert Einstein in the early 20th "
+            "century. The theory has two parts"
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt="Simply put, the theory of relativity states that",
+            max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)),
+        )
+        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)
diff --git a/tests/executorch/runtime/test_modeling_olmo.py b/tests/executorch/runtime/test_modeling_olmo.py
new file mode 100644
index 00000000000..aa57496f291
--- /dev/null
+++ b/tests/executorch/runtime/test_modeling_olmo.py
@@ -0,0 +1,54 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import pytest
+from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from transformers import AutoTokenizer
+from transformers.testing_utils import (
+    slow,
+)
+
+from optimum.executorchruntime import ExecuTorchModelForCausalLM
+
+
+class ExecuTorchModelIntegrationTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @slow
+    @pytest.mark.run_slow
+    def test_olmo_text_generation_with_xnnpack(self):
+        model_id = "allenai/OLMo-1B-hf"
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_name_or_path=model_id,
+            export=True,
+            task="text-generation",
+            recipe="xnnpack",
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        EXPECTED_GENERATED_TEXT = (
+            "Simply put, the theory of relativity states that the speed of light is the same in all directions."
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt="Simply put, the theory of relativity states that",
+            max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)),
+        )
+        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)
diff --git a/tests/executorch/runtime/test_modeling_qwen2.py b/tests/executorch/runtime/test_modeling_qwen2.py
new file mode 100644
index 00000000000..ef624a784ea
--- /dev/null
+++ b/tests/executorch/runtime/test_modeling_qwen2.py
@@ -0,0 +1,52 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import pytest
+from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from transformers import AutoTokenizer
+from transformers.testing_utils import (
+    slow,
+)
+
+from optimum.executorchruntime import ExecuTorchModelForCausalLM
+
+
+class ExecuTorchModelIntegrationTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @slow
+    @pytest.mark.run_slow
+    def test_qwen2_5_text_generation_with_xnnpack(self):
+        model_id = "Qwen/Qwen2.5-0.5B"
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_name_or_path=model_id,
+            export=True,
+            task="text-generation",
+            recipe="xnnpack",
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        EXPECTED_GENERATED_TEXT = "My favourite condiment is iced tea. I love it with my breakfast, my lunch"
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt="My favourite condiment is ",
+            max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)),
+        )
+        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)
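For reference, the new runtime tests above all exercise the same flow: export a causal-LM checkpoint to an ExecuTorch .pte program with the "xnnpack" recipe, load it, and generate text. A minimal standalone sketch of that flow, using only calls that appear in the tests themselves — the model id, prompt, and fixed max_seq_len=64 are illustrative placeholders, not values prescribed by this patch:

# Sketch only: mirrors the API exercised by the tests above; requires the
# executorch pybindings and access to the Hugging Face Hub.
from transformers import AutoTokenizer

from optimum.executorchruntime import ExecuTorchModelForCausalLM

model_id = "NousResearch/Llama-3.2-1B"  # example checkpoint, as used in the tests

# Export to an ExecuTorch program with the XNNPACK recipe and load it for inference.
model = ExecuTorchModelForCausalLM.from_pretrained(
    model_name_or_path=model_id,
    export=True,
    task="text-generation",
    recipe="xnnpack",
)

# model.model is the underlying ExecuTorchModule; generation goes through the
# text_generation() helper that the tests call.
tokenizer = AutoTokenizer.from_pretrained(model_id)
generated_text = model.text_generation(
    tokenizer=tokenizer,
    prompt="Simply put, the theory of relativity states that",
    max_seq_len=64,  # placeholder; the tests derive this from the expected output length
)
print(generated_text)

As in test_load_model_from_local_path, the .pte artifact can also be produced ahead of time with optimum.exporters.executorch.main_export(...) and loaded back with export=False.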