From 5b90bd7c4f56f20f99bc32d2434b2c916bf13921 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 24 Apr 2024 11:34:18 +0200 Subject: [PATCH 01/31] Remove call to `apt update` before `apt purge` in the main doc build workflow (#1830) --- .github/workflows/build_main_documentation.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index f25ee611f6f..e2a90843aa3 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -57,7 +57,6 @@ jobs: - name: Free disk space run: | df -h - sudo apt-get update sudo apt-get purge -y '^apache.*' sudo apt-get purge -y '^imagemagick.*' sudo apt-get purge -y '^dotnet.*' From 8180375d293a8c7f18d7a2b80c7521e09c5f78a9 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 24 Apr 2024 11:39:17 +0200 Subject: [PATCH 02/31] Update github workflows (#1829) --- .github/workflows/dev_test_bettertransformer.yml | 4 ++-- .github/workflows/dev_test_dummy_inputs.yml | 4 ++-- .github/workflows/dev_test_fx.yml | 4 ++-- .github/workflows/dev_test_onnx.yml | 4 ++-- .github/workflows/dev_test_onnxruntime.yml | 4 ++-- .github/workflows/dev_test_optimum_common.yml | 4 ++-- .github/workflows/test_bettertransformer.yml | 2 +- .github/workflows/test_cli.yml | 2 +- .github/workflows/test_dummy_inputs.yml | 2 +- .github/workflows/test_fx.yml | 2 +- .github/workflows/test_onnx.yml | 2 +- .github/workflows/test_onnxruntime.yml | 2 +- .github/workflows/test_optimum_common.yml | 2 +- 13 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/dev_test_bettertransformer.yml b/.github/workflows/dev_test_bettertransformer.yml index 28add750ca0..e4c999ca6da 100644 --- a/.github/workflows/dev_test_bettertransformer.yml +++ b/.github/workflows/dev_test_bettertransformer.yml @@ -16,7 +16,7 @@ jobs: - 3.8 os: - ubuntu-20.04 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -35,4 +35,4 @@ jobs: - name: Test with unittest working-directory: tests run: | - python -m unittest discover -s bettertransformer -p test_*.py \ No newline at end of file + python -m unittest discover -s bettertransformer -p test_*.py diff --git a/.github/workflows/dev_test_dummy_inputs.yml b/.github/workflows/dev_test_dummy_inputs.yml index f0afb16c57b..49baa49c418 100644 --- a/.github/workflows/dev_test_dummy_inputs.yml +++ b/.github/workflows/dev_test_dummy_inputs.yml @@ -17,7 +17,7 @@ jobs: - 3.9 os: - ubuntu-20.04 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -35,4 +35,4 @@ jobs: - name: Test with unittest working-directory: tests run: | - python -m unittest discover -s utils -p test_*.py \ No newline at end of file + python -m unittest discover -s utils -p test_*.py diff --git a/.github/workflows/dev_test_fx.yml b/.github/workflows/dev_test_fx.yml index 7f007dfaafc..0b8633282f7 100644 --- a/.github/workflows/dev_test_fx.yml +++ b/.github/workflows/dev_test_fx.yml @@ -17,7 +17,7 @@ jobs: - 3.9 os: - ubuntu-20.04 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -35,4 +35,4 @@ jobs: - name: Test with unittest working-directory: tests run: | - python -m pytest fx/optimization/test_transformations.py --exitfirst \ No newline at end of file + python -m pytest fx/optimization/test_transformations.py --exitfirst diff --git 
a/.github/workflows/dev_test_onnx.yml b/.github/workflows/dev_test_onnx.yml index 46b548b7ede..48052cfded3 100644 --- a/.github/workflows/dev_test_onnx.yml +++ b/.github/workflows/dev_test_onnx.yml @@ -17,7 +17,7 @@ jobs: - 3.9 os: - ubuntu-20.04 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -34,4 +34,4 @@ jobs: - name: Test with unittest working-directory: tests run: | - python -m unittest discover -s onnx -p test_*.py \ No newline at end of file + python -m unittest discover -s onnx -p test_*.py diff --git a/.github/workflows/dev_test_onnxruntime.yml b/.github/workflows/dev_test_onnxruntime.yml index 9d097ad7255..857028ab2db 100644 --- a/.github/workflows/dev_test_onnxruntime.yml +++ b/.github/workflows/dev_test_onnxruntime.yml @@ -18,7 +18,7 @@ jobs: os: - ubuntu-20.04 - windows-2019 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -36,4 +36,4 @@ jobs: working-directory: tests run: | python -m pytest -n auto -m "not run_in_series" onnxruntime - python -m pytest -m "run_in_series" onnxruntime \ No newline at end of file + python -m pytest -m "run_in_series" onnxruntime diff --git a/.github/workflows/dev_test_optimum_common.yml b/.github/workflows/dev_test_optimum_common.yml index bec011b5246..807ed0b1dab 100644 --- a/.github/workflows/dev_test_optimum_common.yml +++ b/.github/workflows/dev_test_optimum_common.yml @@ -19,7 +19,7 @@ jobs: os: - ubuntu-20.04 - windows-2019 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -42,4 +42,4 @@ jobs: as the staging tests cannot run in parallel. export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == 3.8 && matrix.os == ubuntu-20.04 }} - python -m unittest discover -s tests -p test_*.py \ No newline at end of file + python -m unittest discover -s tests -p test_*.py diff --git a/.github/workflows/test_bettertransformer.yml b/.github/workflows/test_bettertransformer.yml index dc57605ba7c..6607466dc22 100644 --- a/.github/workflows/test_bettertransformer.yml +++ b/.github/workflows/test_bettertransformer.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml index 665982fc4a6..7eae0186076 100644 --- a/.github/workflows/test_cli.yml +++ b/.github/workflows/test_cli.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_dummy_inputs.yml b/.github/workflows/test_dummy_inputs.yml index a0c7e448ca7..60ca033843c 100644 --- a/.github/workflows/test_dummy_inputs.yml +++ b/.github/workflows/test_dummy_inputs.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml index edede4ad68b..2535f1b154d 100644 --- a/.github/workflows/test_fx.yml +++ b/.github/workflows/test_fx.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 7cac9e4e829..5a21f12d015 100644 --- 
a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 30d45d4d5ea..f173cc6c6bd 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, windows-2019, macos-latest] + os: [ubuntu-20.04, windows-2019, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_optimum_common.yml b/.github/workflows/test_optimum_common.yml index bd616e19585..ded149c9b69 100644 --- a/.github/workflows/test_optimum_common.yml +++ b/.github/workflows/test_optimum_common.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, windows-2019, macos-latest] + os: [ubuntu-20.04, windows-2019, macos-13] runs-on: ${{ matrix.os }} steps: From a7296b45de0ea4b7e115543eb04043c729ea0ef3 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 24 Apr 2024 12:08:19 +0200 Subject: [PATCH 03/31] Remove bad PPA in main doc build workflow (#1831) --- .github/workflows/build_main_documentation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index e2a90843aa3..f1f4a23262a 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -137,6 +137,8 @@ jobs: run: | cd optimum-furiosa pip install . + sudo apt install software-properties-common + sudo add-apt-repository --remove https://packages.microsoft.com/ubuntu/22.04/prod sudo apt update sudo apt install -y ca-certificates apt-transport-https gnupg sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-key 5F03AFA423A751913F249259814F888B20B09A7E From 3b5c486821628d1f1e4c34a59d097e5a5f4e1803 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Thu, 25 Apr 2024 12:13:10 +0200 Subject: [PATCH 04/31] Fix TPU doc build (#1834) --- .github/workflows/build_main_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index f1f4a23262a..20face917ab 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -161,7 +161,7 @@ jobs: sudo docker system prune -a -f cd optimum-tpu pip install -U pip - pip install . + pip install . -f https://storage.googleapis.com/libtpu-releases/index.html doc-builder build optimum.tpu docs/source/ --build_dir tpu-doc-build --version pr_$PR_NUMBER --version_tag_suffix "" --html --clean mv tpu-doc-build ../optimum cd .. diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index c1fc4d859ce..e5f2dcb0d18 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -101,7 +101,7 @@ jobs: sudo docker system prune -a -f cd optimum-tpu pip install -U pip - pip install . + pip install . 
-f https://storage.googleapis.com/libtpu-releases/index.html doc-builder build optimum.tpu docs/source/ --build_dir tpu-doc-build --version pr_$PR_NUMBER --version_tag_suffix "" --html --clean mv tpu-doc-build ../optimum cd .. From c55f8824f58db1a2f1cfc7879451b4743b8f206b Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 25 Apr 2024 15:15:34 +0200 Subject: [PATCH 05/31] Fix infer library for sentence transformers models (#1832) --- optimum/exporters/tasks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index ca71dca92a9..4ec641d6c14 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1671,7 +1671,10 @@ def _infer_library_from_model( if library_name is not None: return library_name - if ( + # SentenceTransformer models have no config attributes + if hasattr(model, "_model_config"): + library_name = "sentence_transformers" + elif ( hasattr(model, "pretrained_cfg") or hasattr(model.config, "pretrained_cfg") or hasattr(model.config, "architecture") @@ -1679,8 +1682,6 @@ def _infer_library_from_model( library_name = "timm" elif hasattr(model.config, "_diffusers_version") or getattr(model, "config_name", "") == "model_index.json": library_name = "diffusers" - elif hasattr(model, "_model_config"): - library_name = "sentence_transformers" else: library_name = "transformers" return library_name @@ -1905,7 +1906,6 @@ def get_model_from_task( model_class = TasksManager.get_model_class_for_task( task, framework, model_type=model_type, model_class_name=model_class_name, library=library_name ) - if library_name == "timm": model = model_class(f"hf_hub:{model_name_or_path}", pretrained=True, exportable=True) model = model.to(torch_dtype).to(device) From e3fd2776a318a3a7b9d33315cc42c04c181f6d2f Mon Sep 17 00:00:00 2001 From: B-201 <116639249+B-201@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:23:04 +0800 Subject: [PATCH 06/31] Fix bug causing random initialization of bias when using GPTQ quantization with models without bias (#1827) * Fix gptq quantization for models without bias * Fix gptq quantization for models without bias --- optimum/gptq/quantizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 289e3256825..2c2c9d7e71a 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -278,19 +278,20 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st elif isinstance(layer, Conv1D): in_features = layer.weight.shape[0] out_features = layer.weight.shape[1] + bias = layer.bias is not None if not (self.desc_act) or self.group_size == -1: new_layer = QuantLinear( self.bits, self.group_size, in_features, out_features, - True, + bias, use_cuda_fp16=self.use_cuda_fp16, weight_dtype=layer.weight.dtype, ) else: new_layer = QuantLinear( - self.bits, self.group_size, in_features, out_features, True, weight_dtype=layer.weight.dtype + self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype ) new_layer.device = device setattr(module, attr, new_layer.to(device)) From 189dd25aa4e247538102c3f48fedc3957304267a Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 2 May 2024 18:50:11 +0200 Subject: [PATCH 07/31] Add onnx export for VITS architecture (#1607) * add onnx export for VITS architecture * fix style * set task --- 
optimum/exporters/onnx/model_configs.py | 19 +++++++++++++++++++ optimum/exporters/tasks.py | 4 ++++ tests/exporters/exporters_utils.py | 1 + 3 files changed, 24 insertions(+) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 72d047efa01..496957b2b5d 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1842,6 +1842,25 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire inputs_or_outputs[f"{name}.{i}.encoder.value"] = {2: "encoder_sequence_length_out"} +class VitsOnnxConfig(TextEncoderOnnxConfig): + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + ATOL_FOR_VALIDATION = 1e-4 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "text_batch_size", 1: "sequence_length"}, + "attention_mask": {0: "text_batch_size", 1: "sequence_length"}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "waveform": {0: "text_batch_size", 1: "n_samples"}, + "spectrogram": {0: "text_batch_size", 2: "num_bins"}, + } + + class Speech2TextDummyAudioInputGenerator(DummyAudioInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): shape = [self.batch_size, self.sequence_length, self.normalized_config.input_features_per_channel] diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 4ec641d6c14..efa782353b4 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1044,6 +1044,10 @@ class TasksManager: "vit": supported_tasks_mapping( "feature-extraction", "image-classification", "masked-im", onnx="ViTOnnxConfig" ), + "vits": supported_tasks_mapping( + "text-to-audio", + onnx="VitsOnnxConfig", + ), "wavlm": supported_tasks_mapping( "feature-extraction", "automatic-speech-recognition", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index bc1d8a4a289..ab0b8488fb8 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -149,6 +149,7 @@ "t5": "hf-internal-testing/tiny-random-t5", "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel", "vit": "hf-internal-testing/tiny-random-vit", + "vits": "echarlaix/tiny-random-vits", "yolos": "hf-internal-testing/tiny-random-YolosModel", "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken "hubert": "hf-internal-testing/tiny-random-HubertModel", From db6db6fc6a0690bce501569ab384f1bf10a2c7da Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Thu, 9 May 2024 02:12:55 -0700 Subject: [PATCH 08/31] Add Phi-3 mini to Optimum (#1841) * Load config from folder * Add Phi-3 to normalized config --- optimum/modeling_base.py | 2 +- optimum/utils/normalized_config.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 9523f5c5042..9663a311692 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -358,7 +358,7 @@ def from_pretrained( if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME: if CONFIG_NAME in os.listdir(os.path.join(model_id, subfolder)): config = AutoConfig.from_pretrained( - os.path.join(model_id, subfolder, CONFIG_NAME), trust_remote_code=trust_remote_code + os.path.join(model_id, subfolder), trust_remote_code=trust_remote_code ) elif CONFIG_NAME in os.listdir(model_id): config = 
AutoConfig.from_pretrained( diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 8a5ef377854..a894001d359 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -252,6 +252,7 @@ class NormalizedConfigManager: "pegasus": BartLikeNormalizedTextConfig, "pix2struct": Pix2StructNormalizedTextConfig, "phi": NormalizedTextConfig, + "phi3": NormalizedTextConfigWithGQA, "poolformer": NormalizedVisionConfig, "regnet": NormalizedVisionConfig, "resnet": NormalizedVisionConfig, From b3ecb6c405b7fd5425d79483fd7dc88c0609be8e Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Thu, 9 May 2024 12:58:50 +0200 Subject: [PATCH 09/31] Update the Transformers dependency in the Habana extra (#1851) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7fac0a1c9d5..4e154819bc2 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,7 @@ "nncf": "optimum-intel[nncf]>=1.15.0", "neural-compressor": "optimum-intel[neural-compressor]>=1.15.0", "graphcore": "optimum-graphcore", - "habana": ["optimum-habana", "transformers >= 4.37.0, < 4.38.0"], + "habana": ["optimum-habana", "transformers >= 4.38.0, < 4.39.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers == 4.36.2"], "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers == 4.36.2"], "furiosa": "optimum-furiosa", From 02c6ed5f413384d543bcf83a3a9094be2c0429a5 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 19:06:40 +0400 Subject: [PATCH 10/31] Make stable diffusion unet and vae number of channels static (#1840) --- optimum/exporters/onnx/model_configs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 496957b2b5d..d4c4ac934b9 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -981,7 +981,7 @@ class UNetOnnxConfig(VisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = { - "sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, + "sample": {0: "batch_size", 2: "height", 3: "width"}, "timestep": {0: "steps"}, "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, } @@ -998,7 +998,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "out_sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, + "out_sample": {0: "batch_size", 2: "height", 3: "width"}, } @property @@ -1045,13 +1045,13 @@ class VaeEncoderOnnxConfig(VisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: return { - "sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, + "sample": {0: "batch_size", 2: "height", 3: "width"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "latent_sample": {0: "batch_size", 1: "num_channels_latent", 2: "height_latent", 3: "width_latent"}, + "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } @@ -1069,13 +1069,13 @@ class VaeDecoderOnnxConfig(VisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: return { - "latent_sample": {0: "batch_size", 1: "num_channels_latent", 2: "height_latent", 3: "width_latent"}, + "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "sample": {0: 
"batch_size", 1: "num_channels", 2: "height", 3: "width"}, + "sample": {0: "batch_size", 2: "height", 3: "width"}, } From e0f58121140ce4baa01919ad70a6c13e936f7605 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Tue, 21 May 2024 19:13:36 +0200 Subject: [PATCH 11/31] Expand support (#1864) add --- optimum/utils/normalized_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index a894001d359..682f70e3ca3 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -253,6 +253,7 @@ class NormalizedConfigManager: "pix2struct": Pix2StructNormalizedTextConfig, "phi": NormalizedTextConfig, "phi3": NormalizedTextConfigWithGQA, + "phi3small": NormalizedTextConfigWithGQA, "poolformer": NormalizedVisionConfig, "regnet": NormalizedVisionConfig, "resnet": NormalizedVisionConfig, From cc9889b78ae00b474478c3933f730b56e68d7dbd Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 23 May 2024 15:58:55 +0200 Subject: [PATCH 12/31] Fix compatibility with transformers v4.41.0 for ONNX (#1860) * bump transformers * update default onnx opset * style * save export for model with invalid generation config * set minimum onnx opset * update setup --- optimum/exporters/onnx/convert.py | 7 ++- optimum/exporters/onnx/model_configs.py | 66 +++++++++++++++---------- 2 files changed, 45 insertions(+), 28 deletions(-) diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index f1122e43626..4d5a2afc374 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -1123,7 +1123,12 @@ def onnx_export_from_model( model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) if generation_config is not None: - generation_config.save_pretrained(output) + # since v4.41.0 an exceptions will be raised when saving a generation config considered invalid + # https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/generation/configuration_utils.py#L697 + try: + generation_config.save_pretrained(output) + except Exception as exception: + logger.warning(f"The generation config is invalid and will not be saved : {exception}") model_name_or_path = model.config._name_or_path maybe_save_preprocessors(model_name_or_path, output) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d4c4ac934b9..eb0ac9a4988 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -99,6 +99,7 @@ class BertOnnxConfig(TextEncoderOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
@property def inputs(self) -> Dict[str, Dict[int, str]]: @@ -114,42 +115,44 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class AlbertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class ConvBertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class ElectraOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class RoFormerOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class SqueezeBertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class MobileBertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class NystromformerOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class XLMOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class SplinterOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class DistilBertOnnxConfig(BertOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + @property def inputs(self) -> Dict[str, Dict[int, str]]: if self.task == "multiple-choice": @@ -172,7 +175,7 @@ class CamembertOnnxConfig(DistilBertOnnxConfig): class FlaubertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class IBertOnnxConfig(DistilBertOnnxConfig): @@ -195,6 +198,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class MarkupLMOnnxConfig(BertOnnxConfig): + DEFAULT_ONNX_OPSET = 11 DUMMY_INPUT_GENERATOR_CLASSES = ( DummyTextInputGenerator, DummyXPathSeqInputGenerator, @@ -706,6 +710,7 @@ class MarianOnnxConfig(BartOnnxConfig): class ViTOnnxConfig(VisionOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig MIN_TORCH_VERSION = version.parse("1.11") + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. @property def inputs(self) -> Dict[str, Dict[int, str]]: @@ -725,36 +730,38 @@ class CvTOnnxConfig(ViTOnnxConfig): class LevitOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class DeiTOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class BeitOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class ConvNextOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class ConvNextV2OnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class MobileViTOnnxConfig(ViTOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 11 class RegNetOnnxConfig(ViTOnnxConfig): # This config has the same inputs as ViTOnnxConfig - pass + DEFAULT_ONNX_OPSET = 11 class ResNetOnnxConfig(ViTOnnxConfig): ATOL_FOR_VALIDATION = 1e-3 + DEFAULT_ONNX_OPSET = 11 class DetrOnnxConfig(ViTOnnxConfig): @@ -776,11 +783,11 @@ class TableTransformerOnnxConfig(DetrOnnxConfig): class YolosOnnxConfig(ViTOnnxConfig): - DEFAULT_ONNX_OPSET = 12 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
class SwinOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class Swin2srOnnxConfig(SwinOnnxConfig): @@ -788,16 +795,17 @@ class Swin2srOnnxConfig(SwinOnnxConfig): class DptOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class GlpnOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class PoolFormerOnnxConfig(ViTOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig ATOL_FOR_VALIDATION = 2e-3 + DEFAULT_ONNX_OPSET = 11 class SegformerOnnxConfig(YolosOnnxConfig): @@ -806,6 +814,7 @@ class SegformerOnnxConfig(YolosOnnxConfig): class MobileNetV1OnnxConfig(ViTOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 11 @property def inputs(self) -> Dict[str, Dict[int, str]]: @@ -817,7 +826,7 @@ class MobileNetV2OnnxConfig(MobileNetV1OnnxConfig): class DonutSwinOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class TimmDefaultOnnxConfig(ViTOnnxConfig): @@ -1191,12 +1200,13 @@ class Data2VecTextOnnxConfig(DistilBertOnnxConfig): class Data2VecVisionOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class Data2VecAudioOnnxConfig(AudioOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedConfig ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class PerceiverDummyInputGenerator(DummyVisionInputGenerator): @@ -1292,18 +1302,19 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): class HubertOnnxConfig(AudioOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedConfig + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class Wav2Vec2OnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class Wav2Vec2ConformerOnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class SEWOnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class SEWDOnnxConfig(HubertOnnxConfig): @@ -1311,11 +1322,11 @@ class SEWDOnnxConfig(HubertOnnxConfig): class UniSpeechOnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class UniSpeechSATOnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class WavLMOnnxConfig(HubertOnnxConfig): @@ -1344,6 +1355,7 @@ class ASTOnnxConfig(OnnxConfig): ) DUMMY_INPUT_GENERATOR_CLASSES = (ASTDummyAudioInputGenerator,) ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
@property def inputs(self) -> Dict[str, Dict[int, str]]: From 7184ef4e720369ed75dcfa1404195fffb7b71aec Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Fri, 24 May 2024 15:53:06 +0200 Subject: [PATCH 13/31] Add phi3 support in ONNX exporter (#1870) * add phi3 support * add test * add phi3 to modeeling test * replace ckpt to avoid remote code * fix test * bump trfrs to fix test --------- Co-authored-by: Jingya --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 9 +++++++++ optimum/exporters/onnx/utils.py | 1 + optimum/exporters/tasks.py | 8 ++++++++ optimum/utils/__init__.py | 1 + setup.py | 2 +- tests/exporters/exporters_utils.py | 1 + tests/onnxruntime/test_modeling.py | 1 + tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 9 files changed, 24 insertions(+), 1 deletion(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 22471c297a5..747e1396fb4 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -77,6 +77,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Pegasus - Perceiver - Phi +- Phi3 - Pix2Struct - PoolFormer - Qwen2(Qwen1.5) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index eb0ac9a4988..e23716d4b74 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -51,6 +51,7 @@ NormalizedSeq2SeqConfig, NormalizedTextAndVisionConfig, NormalizedTextConfig, + NormalizedTextConfigWithGQA, NormalizedVisionConfig, is_diffusers_available, logging, @@ -291,6 +292,14 @@ class PhiOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +class Phi3OnnxConfig(PhiOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + MistralDummyPastKeyValuesGenerator, + ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfigWithGQA + + class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35 MIN_TRANSFORMERS_VERSION = version.parse("4.34.99") diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 747cc687996..8ecba9231f6 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -84,6 +84,7 @@ "llama", "mistral", "phi", + "phi3", "qwen2", } diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index efa782353b4..608b3df0d7c 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -893,6 +893,14 @@ class TasksManager: "text-classification", onnx="PhiOnnxConfig", ), + "phi3": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + onnx="Phi3OnnxConfig", + ), "pix2struct": supported_tasks_mapping( "image-to-text", "image-to-text-with-past", diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 07be3f7e1a6..5d5044e63e1 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -80,5 +80,6 @@ NormalizedSeq2SeqConfig, NormalizedTextAndVisionConfig, NormalizedTextConfig, + NormalizedTextConfigWithGQA, NormalizedVisionConfig, ) diff --git a/setup.py b/setup.py index 4e154819bc2..407f6a2a3fb 100644 --- a/setup.py +++ 
b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.26.0,<4.41.0", + "transformers[sentencepiece]>=4.26.0,<4.42.0", "torch>=1.11", "packaging", "numpy", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index ab0b8488fb8..0c52754ff60 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -132,6 +132,7 @@ "hf-internal-testing/tiny-random-vision_perceiver_conv": ["image-classification"], }, "phi": "echarlaix/tiny-random-PhiForCausalLM", + "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", # "rembert": "google/rembert", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index dd2bc858c41..182e64beb90 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2258,6 +2258,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "llama", "mistral", "mpt", + "phi3", "qwen2", ] diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 9de5e495e3b..65298265780 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -128,6 +128,7 @@ "pegasus": "hf-internal-testing/tiny-random-PegasusModel", "perceiver_text": "hf-internal-testing/tiny-random-language_perceiver", "perceiver_vision": "hf-internal-testing/tiny-random-vision_perceiver_conv", + "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen2": "fxmarty/tiny-dummy-qwen2", From d2fade2bf7fd95fef0addbcea62e6c597930df37 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 27 May 2024 16:38:34 +0200 Subject: [PATCH 14/31] Fix FX CI (#1866) * use gen_constructor_wrapper * use original wrapper generator and eager attn * force fx tracing --- .github/workflows/test_fx.yml | 33 ++++++++++--------- optimum/fx/optimization/transformations.py | 24 +++++++++++--- tests/fx/optimization/test_transformations.py | 3 +- 3 files changed, 40 insertions(+), 20 deletions(-) diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml index 2535f1b154d..f0366cf0d1e 100644 --- a/.github/workflows/test_fx.yml +++ b/.github/workflows/test_fx.yml @@ -2,9 +2,9 @@ name: FX / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -20,16 +20,19 @@ jobs: runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install .[tests] - pip install git+https://github.com/huggingface/transformers.git - - name: Test with unittest - working-directory: tests - run: | - python -m pytest fx/optimization/test_transformations.py --exitfirst + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install .[tests] + + - name: Test with pytest + working-directory: tests + run: | + python -m 
pytest -s -v -x fx/optimization diff --git a/optimum/fx/optimization/transformations.py b/optimum/fx/optimization/transformations.py index 2013a063434..348a862db81 100644 --- a/optimum/fx/optimization/transformations.py +++ b/optimum/fx/optimization/transformations.py @@ -19,15 +19,31 @@ import operator import warnings from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List +from typing import List import torch +from torch.fx import GraphModule, Node from transformers.file_utils import add_end_docstrings -from transformers.utils.fx import _gen_constructor_wrapper -if TYPE_CHECKING: - from torch.fx import GraphModule, Node +try: + from transformers.utils.fx import _gen_constructor_wrapper +except ImportError: + from transformers.utils.fx import gen_constructor_wrapper + + def _gen_constructor_wrapper(*args, **kwargs): + wrapper, target = gen_constructor_wrapper(*args, **kwargs) + + def wrapper_with_forced_tracing(*_args, **_kwargs): + import torch.fx._symbolic_trace + + orginal_flag = torch.fx._symbolic_trace._is_fx_tracing_flag + torch.fx._symbolic_trace._is_fx_tracing_flag = True + out = wrapper(*_args, **_kwargs) + torch.fx._symbolic_trace._is_fx_tracing_flag = orginal_flag + return out + + return wrapper_with_forced_tracing, target _ATTRIBUTES_DOCSTRING = r""" diff --git a/tests/fx/optimization/test_transformations.py b/tests/fx/optimization/test_transformations.py index e6a77a13ffc..3aaa7fe6c69 100644 --- a/tests/fx/optimization/test_transformations.py +++ b/tests/fx/optimization/test_transformations.py @@ -86,7 +86,8 @@ def transform(self, graph_module): def get_bert_model(): - model = BertModel.from_pretrained(_MODEL_NAME) + # sdpa attn became default + model = BertModel.from_pretrained(_MODEL_NAME, attn_implementation="eager") model.eval() traced = symbolic_trace(model, input_names=["input_ids", "attention_mask", "token_type_ids"]) return model, traced From ff0a0b3ad13572df76bf13fead7cdfeafbd74c0f Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 27 May 2024 16:41:48 +0200 Subject: [PATCH 15/31] Fix Utils CI (#1867) * use smallest split * fix typing * use pytest * utils ci * test disk freeing action * test without freeing disk * disable caching * reduce num_samples * avoid downloading torch cuda binaries * add verbosity --- .github/workflows/test_dummy_inputs.yml | 37 ----------------- .github/workflows/test_utils.yml | 40 +++++++++++++++++++ .../preprocessing/task_processors_manager.py | 6 +-- tests/utils/test_dummpy_input_generators.py | 8 ++-- tests/utils/test_task_processors.py | 17 +++++++- 5 files changed, 62 insertions(+), 46 deletions(-) delete mode 100644 .github/workflows/test_dummy_inputs.yml create mode 100644 .github/workflows/test_utils.yml diff --git a/.github/workflows/test_dummy_inputs.yml b/.github/workflows/test_dummy_inputs.yml deleted file mode 100644 index 60ca033843c..00000000000 --- a/.github/workflows/test_dummy_inputs.yml +++ /dev/null @@ -1,37 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Dummy inputs / Python - Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build: - strategy: - fail-fast: false - matrix: - python-version: 
[3.8, 3.9] - os: [ubuntu-20.04, macos-13] - - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install .[tests] - - name: Test with unittest - working-directory: tests - run: | - python -m unittest discover -s utils -p 'test_*.py' diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml new file mode 100644 index 00000000000..1ef33ced086 --- /dev/null +++ b/.github/workflows/test_utils.yml @@ -0,0 +1,40 @@ +name: Utils / Python - Test + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + strategy: + fail-fast: false + matrix: + os: [ubuntu-20.04, macos-13] + python-version: [3.8, 3.9] + + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install .[tests] + + - name: Test with pytest + working-directory: tests + run: | + python -m pytest -s -vvvv utils diff --git a/optimum/utils/preprocessing/task_processors_manager.py b/optimum/utils/preprocessing/task_processors_manager.py index 2720ed41fbb..0426d1a2b43 100644 --- a/optimum/utils/preprocessing/task_processors_manager.py +++ b/optimum/utils/preprocessing/task_processors_manager.py @@ -23,7 +23,7 @@ if TYPE_CHECKING: - from .base import DatasetProcessing + from .base import TaskProcessor class TaskProcessorsManager: @@ -35,7 +35,7 @@ class TaskProcessorsManager: } @classmethod - def get_task_processor_class_for_task(cls, task: str) -> Type: + def get_task_processor_class_for_task(cls, task: str) -> Type["TaskProcessor"]: if task not in cls._TASK_TO_DATASET_PROCESSING_CLASS: supported_tasks = ", ".join(cls._TASK_TO_DATASET_PROCESSING_CLASS.keys()) raise KeyError( @@ -45,5 +45,5 @@ def get_task_processor_class_for_task(cls, task: str) -> Type: return cls._TASK_TO_DATASET_PROCESSING_CLASS[task] @classmethod - def for_task(cls, task: str, *dataset_processing_args, **dataset_processing_kwargs: Any) -> "DatasetProcessing": + def for_task(cls, task: str, *dataset_processing_args, **dataset_processing_kwargs: Any) -> "TaskProcessor": return cls.get_task_processor_class_for_task(task)(*dataset_processing_args, **dataset_processing_kwargs) diff --git a/tests/utils/test_dummpy_input_generators.py b/tests/utils/test_dummpy_input_generators.py index 9dd83714c5f..ff9558f1477 100644 --- a/tests/utils/test_dummpy_input_generators.py +++ b/tests/utils/test_dummpy_input_generators.py @@ -31,13 +31,13 @@ from optimum.utils.input_generators import DummyInputGenerator -TEXT_ENCODER_MODELS = {"distilbert": "distilbert-base-cased"} +TEXT_ENCODER_MODELS = {"distilbert": "hf-internal-testing/tiny-random-DistilBertModel"} VISION_MODELS = {"resnet": "hf-internal-testing/tiny-random-resnet"} -SEQ2SEQ_MODELS = {"t5": "t5-small"} +SEQ2SEQ_MODELS = {"t5": "hf-internal-testing/tiny-random-T5Model"} -AUDIO_MODELS = {"whisper": "openai/whisper-tiny.en"} +AUDIO_MODELS = {"whisper": 
"hf-internal-testing/tiny-random-WhisperModel"} DUMMY_SHAPES = { "batch_size": [2, 4], @@ -60,7 +60,7 @@ class GenerateDummy(TestCase): "np": tuple, } if is_tf_available(): - import tensorflow as tf + import tensorflow as tf # type: ignore[import] _FRAMEWORK_TO_SHAPE_CLS["tf"] = tf.TensorShape diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index f8a0a6d5a92..af89aec2b90 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -55,6 +55,9 @@ }, } +LOAD_SMALLEST_SPLIT = True +NUM_SAMPLES = 10 + # Taken from https://pynative.com/python-generate-random-string/ def get_random_string(length: int) -> str: @@ -148,7 +151,11 @@ def _test_load_dataset( ) dataset_with_all_columns = None if default_dataset: - dataset = task_processor.load_default_dataset(only_keep_necessary_columns=only_keep_necessary_columns) + dataset = task_processor.load_default_dataset( + only_keep_necessary_columns=only_keep_necessary_columns, + load_smallest_split=LOAD_SMALLEST_SPLIT, + num_samples=NUM_SAMPLES, + ) if only_keep_necessary_columns: dataset_with_all_columns = task_processor.load_default_dataset() else: @@ -157,11 +164,17 @@ def _test_load_dataset( path, data_keys=data_keys, only_keep_necessary_columns=only_keep_necessary_columns, + load_smallest_split=LOAD_SMALLEST_SPLIT, + num_samples=NUM_SAMPLES, **load_dataset_kwargs, ) if only_keep_necessary_columns: dataset_with_all_columns = task_processor.load_dataset( - path, data_keys=data_keys, **load_dataset_kwargs + path, + data_keys=data_keys, + load_smallest_split=LOAD_SMALLEST_SPLIT, + num_samples=NUM_SAMPLES, + **load_dataset_kwargs, ) # We only check if the column names of the dataset with the not necessary columns removed are a strict subset From e81bd73a778b4833b8b2781c16b4427b7aa1111c Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 27 May 2024 16:46:15 +0200 Subject: [PATCH 16/31] Fix BT CI (#1872) * fix bt test failures due to default sdpa attention * exclude macos13+py3.8 * update tr * check transformers version --- .github/workflows/test_bettertransformer.yml | 54 +++++++++++--------- optimum/pipelines/pipelines_base.py | 9 +++- tests/bettertransformer/test_encoder.py | 2 +- tests/bettertransformer/testing_utils.py | 2 +- 4 files changed, 39 insertions(+), 28 deletions(-) diff --git a/.github/workflows/test_bettertransformer.yml b/.github/workflows/test_bettertransformer.yml index 6607466dc22..080d8272dfc 100644 --- a/.github/workflows/test_bettertransformer.yml +++ b/.github/workflows/test_bettertransformer.yml @@ -2,9 +2,9 @@ name: BetterTransformer / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,29 +17,35 @@ jobs: matrix: python-version: [3.8, 3.9] os: [ubuntu-20.04, macos-13] + exclude: [{ python-version: 3.8, os: macos-13 }] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install .[tests] - pip install --no-cache-dir --upgrade torch torchvision torchaudio - pip install accelerate - - name: Test on pytorch stable - working-directory: tests - run: | - pytest bettertransformer/test_*.py -s -vvvvv - - name: Install dependencies 2 - run: | - pip uninstall -y torch 
torchvision torchaudio - pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu - - name: Test on pytorch nightly - working-directory: tests - run: | - pytest bettertransformer/test_*.py -s -vvvvv + - name: Checkout code + uses: actions/checkout@v4 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install .[tests] + pip install --no-cache-dir --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install accelerate + + - name: Test with stable pytorch + working-directory: tests + run: | + pytest bettertransformer -s -vvvvv + + - name: Install dependencies 2 + run: | + pip uninstall -y torch torchvision torchaudio + pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu + + - name: Test with nightly pytorch + working-directory: tests + run: | + pytest bettertransformer -s -vvvvv diff --git a/optimum/pipelines/pipelines_base.py b/optimum/pipelines/pipelines_base.py index e2046882bd6..cc36e94ef5d 100644 --- a/optimum/pipelines/pipelines_base.py +++ b/optimum/pipelines/pipelines_base.py @@ -45,7 +45,7 @@ from transformers.pipelines import infer_framework_load_model from ..bettertransformer import BetterTransformer -from ..utils import is_onnxruntime_available +from ..utils import check_if_transformers_greater, is_onnxruntime_available from ..utils.file_utils import find_files_matching_pattern @@ -179,7 +179,12 @@ def load_bettertransformer( **kwargs, ): if model_kwargs is None: - model_kwargs = {} + # the argument was first introduced in 4.36.0 but most models didn't have an sdpa implementation then + # see https://github.com/huggingface/transformers/blob/v4.36.0/src/transformers/modeling_utils.py#L1258 + if check_if_transformers_greater("4.36.0"): + model_kwargs = {"attn_implementation": "eager"} + else: + model_kwargs = {} if model is None: model_id = SUPPORTED_TASKS[targeted_task]["default"] diff --git a/tests/bettertransformer/test_encoder.py b/tests/bettertransformer/test_encoder.py index cbf4bcbae90..74aacaed58c 100644 --- a/tests/bettertransformer/test_encoder.py +++ b/tests/bettertransformer/test_encoder.py @@ -114,7 +114,7 @@ def test_inference_speed(self): """ model_name = "bert-base-uncased" - hf_model = AutoModel.from_pretrained(model_name).eval() + hf_model = AutoModel.from_pretrained(model_name, attn_implementation="eager").eval() bt_model = BetterTransformer.transform(hf_model, keep_original_model=True) BATCH_SIZE = 8 diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index eb4f0ab9a4d..6e7ff71ddd9 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -235,7 +235,7 @@ def _test_logits(self, model_id: str, model_type: str, **preprocessor_kwargs): inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **preprocessor_kwargs) torch.manual_seed(0) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() random_config = hf_random_model.config hf_random_model = hf_random_model.eval() From 6d56c5fadd94e388efe2674820f167ab7c004a6f Mon Sep 17 00:00:00 2001 From: Sarthak Gupta <81774392+mr-sarthakgupta@users.noreply.github.com> Date: Tue, 28 May 2024 13:49:57 +0530 Subject: 
[PATCH 17/31] Fix ORTConfig loading (#1879) --- optimum/commands/onnxruntime/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/commands/onnxruntime/quantize.py b/optimum/commands/onnxruntime/quantize.py index 0ce7e6c3dce..2613cb33ba6 100644 --- a/optimum/commands/onnxruntime/quantize.py +++ b/optimum/commands/onnxruntime/quantize.py @@ -96,7 +96,7 @@ def run(self): "TensorRT quantization relies on static quantization that requires calibration, which is currently not supported through optimum-cli. Please adapt Optimum static quantization examples to run static quantization for TensorRT: https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/quantization" ) else: - qconfig = ORTConfig.from_pretained(self.args.config).quantization + qconfig = ORTConfig.from_pretrained(self.args.config).quantization for q in quantizers: q.quantize(save_dir=save_dir, quantization_config=qconfig) From f3008651c6f674d4b89de66a2d21fc5e7cafaf84 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Tue, 28 May 2024 14:02:19 +0530 Subject: [PATCH 18/31] Update ORT doc for ROCM 6.0 (#1862) * Update ORT doc for ROCM 6.0 * Update amdgpu.mdx --- docs/source/onnxruntime/usage_guides/amdgpu.mdx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/onnxruntime/usage_guides/amdgpu.mdx b/docs/source/onnxruntime/usage_guides/amdgpu.mdx index acd8d732ac3..575f7700ce9 100644 --- a/docs/source/onnxruntime/usage_guides/amdgpu.mdx +++ b/docs/source/onnxruntime/usage_guides/amdgpu.mdx @@ -7,11 +7,11 @@ Our testing involved AMD Instinct GPUs, and for specific GPU compatibility, plea This guide will show you how to run inference on the `ROCMExecutionProvider` execution provider that ONNX Runtime supports for AMD GPUs. ## Installation -The following setup installs the ONNX Runtime support with ROCM Execution Provider with ROCm 5.7. +The following setup installs the ONNX Runtime support with ROCM Execution Provider with ROCm 6.0. #### 1 ROCm Installation -Refer to the [ROCm installation guide](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) to install ROCm 5.7. +Refer to the [ROCm installation guide](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) to install ROCm 6.0. #### 2 Installing `onnxruntime-rocm` @@ -26,11 +26,11 @@ docker build -f Dockerfile -t ort/rocm . **Local Installation Steps:** ##### 2.1 PyTorch with ROCm Support -Optimum ONNX Runtime integration relies on some functionalities of Transformers that require PyTorch. For now, we recommend to use Pytorch compiled against RoCm 5.7, that can be installed following [PyTorch installation guide](https://pytorch.org/get-started/locally/): +Optimum ONNX Runtime integration relies on some functionalities of Transformers that require PyTorch. For now, we recommend to use Pytorch compiled against RoCm 6.0, that can be installed following [PyTorch installation guide](https://pytorch.org/get-started/locally/): ```bash -pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7 -# Use 'rocm/pytorch:latest' as the preferred base image when using Docker for PyTorch installation. +pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0 +# Use 'rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2' as the preferred base image when using Docker for PyTorch installation. 
``` ##### 2.2 ONNX Runtime with ROCm Execution Provider @@ -42,13 +42,13 @@ pip install cmake onnx curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh # Install ONNXRuntime from source -git clone --recursive https://github.com/ROCmSoftwarePlatform/onnxruntime.git +git clone --single-branch --branch main --recursive https://github.com/Microsoft/onnxruntime onnxruntime cd onnxruntime -git checkout rocm5.7_internal_testing_eigen-3.4.zip_hash -./build.sh --config Release --build_wheel --update --build --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --rocm_home=/opt/rocm +./build.sh --config Release --build_wheel --allow_running_as_root --update --build --parallel --cmake_extra_defines CMAKE_HIP_ARCHITECTURES=gfx90a,gfx942 ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --rocm_home=/opt/rocm pip install build/Linux/Release/dist/* ``` +Note: The instructions build ORT for `MI210/MI250/MI300` gpus. To support other architectures, please update the `CMAKE_HIP_ARCHITECTURES` in the build command. To avoid conflicts between `onnxruntime` and `onnxruntime-rocm`, make sure the package `onnxruntime` is not installed by running `pip uninstall onnxruntime` prior to installing `onnxruntime-rocm`. From cbbda3e43284c49a02732375cfcabc61e4923046 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 28 May 2024 18:03:08 +0200 Subject: [PATCH 19/31] Fix ort config instantiation (from_pretrained) and saving (save_pretrained) (#1865) * fix ort config instatiation (from_dict) and saving (to_dict) * added tests for quantization with ort config * style * handle empty quant dictionary --- .github/workflows/test_cli.yml | 33 ++++++++++--------- optimum/onnxruntime/configuration.py | 49 ++++++++++++++++++++++++++-- tests/cli/test_cli.py | 31 +++++++++--------- 3 files changed, 80 insertions(+), 33 deletions(-) diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml index 7eae0186076..ecb19d23aa3 100644 --- a/.github/workflows/test_cli.yml +++ b/.github/workflows/test_cli.yml @@ -4,9 +4,9 @@ name: Optimum CLI / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -22,17 +22,20 @@ jobs: runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install .[tests,exporters,exporters-tf] - - name: Test with unittest - working-directory: tests - run: | - python -m unittest discover -s cli -p 'test_*.py' + - name: Checkout code + uses: actions/checkout@v4 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install .[tests,exporters,exporters-tf] + + - name: Test with pytest + run: | + pytest tests/cli -s -vvvv --durations=0 diff --git a/optimum/onnxruntime/configuration.py b/optimum/onnxruntime/configuration.py index c11cf58b8b0..2e3d9f32d6a 100644 --- a/optimum/onnxruntime/configuration.py +++ b/optimum/onnxruntime/configuration.py @@ -18,7 +18,7 @@ from 
dataclasses import asdict, dataclass, field from enum import Enum from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union from datasets import Dataset from packaging.version import Version, parse @@ -298,6 +298,15 @@ def __post_init__(self): ) self.operators_to_quantize = operators_to_quantize + if isinstance(self.format, str): + self.format = QuantFormat[self.format] + if isinstance(self.mode, str): + self.mode = QuantizationMode[self.mode] + if isinstance(self.activations_dtype, str): + self.activations_dtype = QuantType[self.activations_dtype] + if isinstance(self.weights_dtype, str): + self.weights_dtype = QuantType[self.weights_dtype] + @staticmethod def quantization_type_str(activations_dtype: QuantType, weights_dtype: QuantType) -> str: return ( @@ -984,8 +993,28 @@ def __init__( self.opset = opset self.use_external_data_format = use_external_data_format self.one_external_file = one_external_file - self.optimization = self.dataclass_to_dict(optimization) - self.quantization = self.dataclass_to_dict(quantization) + + if isinstance(optimization, dict) and optimization: + self.optimization = OptimizationConfig(**optimization) + elif isinstance(optimization, OptimizationConfig): + self.optimization = optimization + elif not optimization: + self.optimization = None + else: + raise ValueError( + f"Optional argument `optimization` must be a dictionary or an instance of OptimizationConfig, got {type(optimization)}" + ) + if isinstance(quantization, dict) and quantization: + self.quantization = QuantizationConfig(**quantization) + elif isinstance(quantization, QuantizationConfig): + self.quantization = quantization + elif not quantization: + self.quantization = None + else: + raise ValueError( + f"Optional argument `quantization` must be a dictionary or an instance of QuantizationConfig, got {type(quantization)}" + ) + self.optimum_version = kwargs.pop("optimum_version", None) @staticmethod @@ -1002,3 +1031,17 @@ def dataclass_to_dict(config) -> dict: v = [elem.name if isinstance(elem, Enum) else elem for elem in v] new_config[k] = v return new_config + + def to_dict(self) -> Dict[str, Any]: + dict_config = { + "opset": self.opset, + "use_external_data_format": self.use_external_data_format, + "one_external_file": self.one_external_file, + "optimization": self.dataclass_to_dict(self.optimization), + "quantization": self.dataclass_to_dict(self.quantization), + } + + if self.optimum_version: + dict_config["optimum_version"] = self.optimum_version + + return dict_config diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index 2e64dc9cdfb..ca4ebf8bd23 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -21,10 +21,8 @@ import unittest from pathlib import Path -from onnxruntime import __version__ as ort_version -from packaging.version import Version, parse - import optimum.commands +from optimum.onnxruntime.configuration import AutoQuantizationConfig, ORTConfig CLI_WIH_CUSTOM_COMMAND_PATH = Path(__file__).parent / "cli_with_custom_command.py" @@ -83,30 +81,33 @@ def test_optimize_commands(self): def test_quantize_commands(self): with tempfile.TemporaryDirectory() as tempdir: + ort_config = ORTConfig(quantization=AutoQuantizationConfig.avx2(is_static=False)) + ort_config.save_pretrained(tempdir) + # First export a tiny encoder, decoder only and encoder-decoder export_commands = [ - f"optimum-cli export onnx --model hf-internal-testing/tiny-random-BertModel {tempdir}/encoder", + f"optimum-cli 
export onnx --model hf-internal-testing/tiny-random-bert {tempdir}/encoder", f"optimum-cli export onnx --model hf-internal-testing/tiny-random-gpt2 {tempdir}/decoder", - # f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder", + f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder", ] quantize_commands = [ f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder --avx2 -o {tempdir}/quantized_encoder", f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/decoder --avx2 -o {tempdir}/quantized_decoder", - # f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2 -o {tempdir}/quantized_encoder_decoder", + f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2 -o {tempdir}/quantized_encoder_decoder", ] - if parse(ort_version) != Version("1.16.0") and parse(ort_version) != Version("1.17.0"): - # Failing on onnxruntime==1.17.0, will be fixed on 1.17.1: https://github.com/microsoft/onnxruntime/pull/19421 - export_commands.append( - f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder" - ) - quantize_commands.append( - f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2 -o {tempdir}/quantized_encoder_decoder" - ) + quantize_with_config_commands = [ + f"optimum-cli onnxruntime quantize --onnx_model hf-internal-testing/tiny-random-bert --c {tempdir}/ort_config.json -o {tempdir}/quantized_encoder_with_config", + f"optimum-cli onnxruntime quantize --onnx_model hf-internal-testing/tiny-random-gpt2 --c {tempdir}/ort_config.json -o {tempdir}/quantized_decoder_with_config", + f"optimum-cli onnxruntime quantize --onnx_model hf-internal-testing/tiny-random-t5 --c {tempdir}/ort_config.json -o {tempdir}/quantized_encoder_decoder_with_config", + ] - for export, quantize in zip(export_commands, quantize_commands): + for export, quantize, quantize_with_config in zip( + export_commands, quantize_commands, quantize_with_config_commands + ): subprocess.run(export, shell=True, check=True) subprocess.run(quantize, shell=True, check=True) + subprocess.run(quantize_with_config, shell=True, check=True) def _run_command_and_check_content(self, command: str, content: str) -> bool: proc = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) From fbbc408b98187b51eaa6bcddc623e928b8e41f99 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 29 May 2024 14:07:54 +0200 Subject: [PATCH 20/31] Fix ORT CI (#1875) * fix quantizer * change diffusion test * install cpu torch * fix * fix only for qdq quantizer * fix past kv in old model * warn * assert text equal * Update optimum/onnxruntime/modeling_decoder.py Co-authored-by: Michael Benayoun * use action to free disk * change input text * keep large packages * update python * test with original gpt2 tokenizer * test * run everything * Update tests/onnxruntime/test_modeling.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * make old onnx model inference tests run in series as they modify the underlying model * my bad * compare optimum ort sd with diffusers onnx sd because they're the only ones using np random states * seperate ort test subsets to propagate each subsets' return code --------- Co-authored-by: Ella Charlaix Co-authored-by: Michael Benayoun Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- 
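Note: the workflow change below splits the ONNX Runtime test suite into two pytest invocations keyed on the custom `run_in_series` marker, so that tests which mutate shared state (e.g. the old exported ONNX models mentioned above) stay out of the parallel pytest-xdist run and each subset propagates its own return code. A minimal, illustrative sketch of how such a marker is declared and applied — the file names and test body are hypothetical, only the marker name and the `-m` selection come from this patch:

```python
# conftest.py (illustrative): register the marker so pytest recognizes it,
# e.g. when collecting with --strict-markers.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "run_in_series: test mutates shared state and must not run in parallel"
    )
```

```python
# test_example.py (hypothetical): opt a test into the serial subset.
import pytest


@pytest.mark.run_in_series
def test_overwrites_cached_onnx_model():
    # Marked tests are collected with `pytest -m "run_in_series"`, while the rest
    # runs in parallel with `pytest -n auto -m "not run_in_series"` (pytest-xdist).
    ...
```
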
.github/workflows/test_onnxruntime.yml | 94 +++++++------------ optimum/onnxruntime/modeling_decoder.py | 39 ++++++-- optimum/onnxruntime/quantization.py | 91 ++++++++---------- tests/onnxruntime/test_modeling.py | 23 +++-- .../test_stable_diffusion_pipeline.py | 39 ++++---- 5 files changed, 131 insertions(+), 155 deletions(-) diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index f173cc6c6bd..4893b681a66 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -4,9 +4,9 @@ name: ONNX Runtime / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -22,62 +22,34 @@ jobs: runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - - name: Free disk space - if: matrix.os == 'ubuntu-20.04' - run: | - df -h - sudo apt-get update - sudo apt-get purge -y '^apache.*' - sudo apt-get purge -y '^imagemagick.*' - sudo apt-get purge -y '^dotnet.*' - sudo apt-get purge -y '^aspnetcore.*' - sudo apt-get purge -y 'php.*' - sudo apt-get purge -y '^temurin.*' - sudo apt-get purge -y '^mysql.*' - sudo apt-get purge -y '^java.*' - sudo apt-get purge -y '^openjdk.*' - sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel - df -h - sudo apt-get autoremove -y >/dev/null 2>&1 - sudo apt-get clean - df -h - echo "https://github.com/actions/virtual-environments/issues/709" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - df -h - echo "remove big /usr/local" - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf /usr/local/lib/android >/dev/null 2>&1 - df -h - echo "remove /usr/share leftovers" - sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1 - sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1 - sudo rm -rf /usr/share/swift > /dev/null 2>&1 - df -h - echo "remove other leftovers" - sudo rm -rf /var/lib/mysql > /dev/null 2>&1 - sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1 - sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1 - sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1 - sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1 - sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1 - sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1 - sudo rm -rf /usr/lib/heroku > /dev/null 2>&1 - sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1 - df -h - - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - pip install .[tests,onnxruntime] - - - name: Test with pytest - working-directory: tests - run: | - pytest -n auto -m "not run_in_series" --durations=0 -vs onnxruntime - pytest -m "run_in_series" --durations=0 onnxruntime + - name: Free Disk Space (Ubuntu) + if: matrix.os == 'ubuntu-20.04' + uses: jlumbroso/free-disk-space@main + with: + tool-cache: false + swap-storage: false + large-packages: false + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install .[tests,onnxruntime] + + - name: Test with pytest (in series) + working-directory: tests + run: | + pytest 
onnxruntime -m "run_in_series" --durations=0 -vvvv -s + + - name: Test with pytest (in parallel) + working-directory: tests + run: | + pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index 455236126b6..2d9be2d757f 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -509,8 +509,6 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - # Since v1.7.0 decoder with past models have fixed sequence length of 1 - # To keep these models compatible we set this dimension to dynamic onnx_model = onnx.load(str(model_cache_path), load_external_data=False) model_uses_external_data = check_model_uses_external_data(onnx_model) @@ -521,24 +519,47 @@ def _from_pretrained( node.name: [dim.dim_value or dim.dim_param for dim in node.type.tensor_type.shape.dim] for node in onnx_model.graph.input } + output_dims = { + node.name: [dim.dim_value or dim.dim_param for dim in node.type.tensor_type.shape.dim] + for node in onnx_model.graph.output + } + + override_dims = False + + # Since v1.7.0 decoder with past models have fixed sequence length of 1 + # To keep these models compatible we set this dimension to dynamic if input_dims["input_ids"][1] == 1: input_dims["input_ids"][1] = "sequence_length" - output_dims = { - node.name: [dim.dim_value or dim.dim_param for dim in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output - } output_dims["logits"][1] = "sequence_length" - onnx_model = update_model_dims.update_inputs_outputs_dims(onnx_model, input_dims, output_dims) + override_dims = True + # Since https://github.com/huggingface/optimum/pull/871/ + # changed axis notation/naming during export, we need to update the dims + for dim in input_dims.keys(): + if "past" in dim and input_dims[dim][2] == "past_sequence_length + sequence_length": + input_dims[dim][2] = "past_sequence_length" + override_dims = True + + if override_dims: + # this is kinda dangerous, warning the user is the least we can do + logger.warning( + "The ONNX model was probably exported with an older version of optimum. " + "We are updating the input/output dimensions and overwriting the model file " + "with new dimensions. This is necessary for the model to work correctly with " + "the current version of optimum. If you encounter any issues, please re-export " + "the model with the latest version of optimum for optimal performance." 
+ ) + onnx_model = update_model_dims.update_inputs_outputs_dims(onnx_model, input_dims, output_dims) onnx.save( onnx_model, str(model_cache_path), save_as_external_data=model_uses_external_data, - all_tensors_to_one_file=True, location=model_cache_path.name + "_data", - size_threshold=0, + all_tensors_to_one_file=True, convert_attribute=True, + size_threshold=0, ) + del onnx_model model = ORTModel.load_model( diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py index d56e301c3cf..d93a7a31320 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -356,62 +356,45 @@ def quantize( ) quantizer_factory = QDQQuantizer if use_qdq else ONNXQuantizer + # TODO: maybe this logic can be moved to a method in the configuration class (get_ort_quantizer_kwargs()) + # that returns the dictionary of arguments to pass to the quantizer factory depending on the ort version + quantizer_kwargs = { + "model": onnx_model, + "static": quantization_config.is_static, + "per_channel": quantization_config.per_channel, + "mode": quantization_config.mode, + "weight_qType": quantization_config.weights_dtype, + "input_qType": quantization_config.activations_dtype, + "tensors_range": calibration_tensors_range, + "reduce_range": quantization_config.reduce_range, + "nodes_to_quantize": quantization_config.nodes_to_quantize, + "nodes_to_exclude": quantization_config.nodes_to_exclude, + "op_types_to_quantize": [ + operator.value if isinstance(operator, ORTQuantizableOperator) else operator + for operator in quantization_config.operators_to_quantize + ], + "extra_options": { + "WeightSymmetric": quantization_config.weights_symmetric, + "ActivationSymmetric": quantization_config.activations_symmetric, + "EnableSubgraph": has_subgraphs, + "ForceSymmetric": quantization_config.activations_symmetric and quantization_config.weights_symmetric, + "AddQDQPairToWeight": quantization_config.qdq_add_pair_to_weight, + "DedicatedQDQPair": quantization_config.qdq_dedicated_pair, + "QDQOpTypePerChannelSupportToAxis": quantization_config.qdq_op_type_per_channel_support_to_axis, + }, + } + + if use_qdq: + quantizer_kwargs.pop("mode") + if parse(ort_version) >= Version("1.18.0"): + # The argument `static` has been removed from the qdq quantizer factory in ORT 1.18 + quantizer_kwargs.pop("static") if parse(ort_version) >= Version("1.13.0"): - # The argument `input_qType` has been changed into `activation_qType` from ORT 1.13 - quantizer = quantizer_factory( - model=onnx_model, - static=quantization_config.is_static, - per_channel=quantization_config.per_channel, - mode=quantization_config.mode, - weight_qType=quantization_config.weights_dtype, - activation_qType=quantization_config.activations_dtype, - tensors_range=calibration_tensors_range, - reduce_range=quantization_config.reduce_range, - nodes_to_quantize=quantization_config.nodes_to_quantize, - nodes_to_exclude=quantization_config.nodes_to_exclude, - op_types_to_quantize=[ - operator.value if isinstance(operator, ORTQuantizableOperator) else operator - for operator in quantization_config.operators_to_quantize - ], - extra_options={ - "WeightSymmetric": quantization_config.weights_symmetric, - "ActivationSymmetric": quantization_config.activations_symmetric, - "EnableSubgraph": has_subgraphs, - "ForceSymmetric": quantization_config.activations_symmetric - and quantization_config.weights_symmetric, - "AddQDQPairToWeight": quantization_config.qdq_add_pair_to_weight, - "DedicatedQDQPair": 
quantization_config.qdq_dedicated_pair, - "QDQOpTypePerChannelSupportToAxis": quantization_config.qdq_op_type_per_channel_support_to_axis, - }, - ) - else: - quantizer = quantizer_factory( - model=onnx_model, - static=quantization_config.is_static, - per_channel=quantization_config.per_channel, - mode=quantization_config.mode, - weight_qType=quantization_config.weights_dtype, - input_qType=quantization_config.activations_dtype, - tensors_range=calibration_tensors_range, - reduce_range=quantization_config.reduce_range, - nodes_to_quantize=quantization_config.nodes_to_quantize, - nodes_to_exclude=quantization_config.nodes_to_exclude, - op_types_to_quantize=[ - operator.value if isinstance(operator, ORTQuantizableOperator) else operator - for operator in quantization_config.operators_to_quantize - ], - extra_options={ - "WeightSymmetric": quantization_config.weights_symmetric, - "ActivationSymmetric": quantization_config.activations_symmetric, - "EnableSubgraph": False, - "ForceSymmetric": quantization_config.activations_symmetric - and quantization_config.weights_symmetric, - "AddQDQPairToWeight": quantization_config.qdq_add_pair_to_weight, - "DedicatedQDQPair": quantization_config.qdq_dedicated_pair, - "QDQOpTypePerChannelSupportToAxis": quantization_config.qdq_op_type_per_channel_support_to_axis, - }, - ) + # The argument `input_qType` has been changed into `activation_qType` in ORT 1.13 + quantizer_kwargs["activation_qType"] = quantizer_kwargs.pop("input_qType") + + quantizer = quantizer_factory(**quantizer_kwargs) LOGGER.info("Quantizing model...") quantizer.quantize_model() diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 182e64beb90..3fe2c5e14dc 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2274,21 +2274,25 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): SPEEDUP_CACHE = 1.1 @parameterized.expand([(False,), (True,)]) + @pytest.mark.run_in_series def test_inference_old_onnx_model(self, use_cache): - model_id = "optimum/gpt2" + tokenizer = get_preprocessor("gpt2") model = AutoModelForCausalLM.from_pretrained("gpt2") - tokenizer = get_preprocessor(model_id) - text = "This is a sample output" - tokens = tokenizer(text, return_tensors="pt") - onnx_model = ORTModelForCausalLM.from_pretrained(model_id, use_cache=use_cache, use_io_binding=use_cache) + onnx_model = ORTModelForCausalLM.from_pretrained("optimum/gpt2", use_cache=use_cache, use_io_binding=use_cache) self.assertEqual(onnx_model.use_cache, use_cache) self.assertEqual(onnx_model.model_path.name, ONNX_DECODER_WITH_PAST_NAME if use_cache else ONNX_DECODER_NAME) - outputs_onnx = onnx_model.generate( - **tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30 + + text = "The capital of France is" + tokens = tokenizer(text, return_tensors="pt") + + onnx_outputs = onnx_model.generate( + **tokens, num_beams=1, do_sample=False, min_new_tokens=10, max_new_tokens=10 ) - outputs = model.generate(**tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30) - self.assertTrue(torch.allclose(outputs_onnx, outputs)) + outputs = model.generate(**tokens, num_beams=1, do_sample=False, min_new_tokens=10, max_new_tokens=10) + onnx_text_outputs = tokenizer.decode(onnx_outputs[0], skip_special_tokens=True) + text_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True) + self.assertEqual(onnx_text_outputs, text_outputs) def test_load_model_from_hub_onnx(self): model = 
ORTModelForCausalLM.from_pretrained("fxmarty/onnx-tiny-random-gpt2-without-merge") @@ -3596,6 +3600,7 @@ def _get_onnx_model_dir(self, model_id, model_arch, test_name): return onnx_model_dir + @pytest.mark.run_in_series def test_inference_old_onnx_model(self): model = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small") diff --git a/tests/onnxruntime/test_stable_diffusion_pipeline.py b/tests/onnxruntime/test_stable_diffusion_pipeline.py index 0e56b22f712..44cd22ffecc 100644 --- a/tests/onnxruntime/test_stable_diffusion_pipeline.py +++ b/tests/onnxruntime/test_stable_diffusion_pipeline.py @@ -227,20 +227,18 @@ def test_compare_diffusers_pipeline(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) height, width = 128, 128 - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + inputs = self.generate_inputs(height=height, width=width) inputs["prompt"] = "A painting of a squirrel eating a burger" inputs["image"] = floats_tensor((1, 3, height, width), rng=random.Random(SEED)) - output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1] - # https://github.com/huggingface/diffusers/blob/v0.17.1/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py#L71 - expected_slice = np.array([0.69643, 0.58484, 0.50314, 0.58760, 0.55368, 0.59643, 0.51529, 0.41217, 0.49087]) - self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1)) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images + + diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - # Verify it can be loaded with ORT diffusers pipeline - diffusers_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_output = diffusers_pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1] - self.assertTrue(np.allclose(output, diffusers_output, atol=1e-2)) + self.assertTrue(np.allclose(ort_output, diffusers_onnx_output, atol=1e-1)) def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): inputs = _generate_inputs(batch_size=batch_size) @@ -418,6 +416,7 @@ def test_compare_diffusers_pipeline(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch]) height, width = 64, 64 latents_shape = ( 1, @@ -425,22 +424,18 @@ def test_compare_diffusers_pipeline(self, model_arch: str): height // ort_pipeline.vae_scale_factor, width // ort_pipeline.vae_scale_factor, ) - latents = np.random.randn(*latents_shape).astype(np.float32) inputs = self.generate_inputs(height=height, width=width) - inputs["image"] = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) - inputs["mask_image"] = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ).resize((width, height)) + np_latents = 
np.random.rand(*latents_shape).astype(np.float32) + torch_latents = torch.from_numpy(np_latents) + + ort_outputs = ort_pipeline(**inputs, latents=np_latents).images + self.assertEqual(ort_outputs.shape, (1, height, width, 3)) + + diffusers_outputs = diffusers_pipeline(**inputs, latents=torch_latents).images + self.assertEqual(diffusers_outputs.shape, (1, height, width, 3)) - outputs = ort_pipeline(**inputs, latents=latents).images - self.assertEqual(outputs.shape, (1, height, width, 3)) - expected_slice = np.array([0.5442, 0.3002, 0.5665, 0.6485, 0.4421, 0.6441, 0.5778, 0.5076, 0.5612]) - self.assertTrue(np.allclose(outputs[0, -3:, -3:, -1].flatten(), expected_slice, atol=1e-4)) + self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4)) def generate_inputs(self, height=128, width=128, batch_size=1): inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width) From e9af0f20e9ddb99c56b1a8e35b5805eb9bed5e52 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 29 May 2024 16:02:46 +0200 Subject: [PATCH 21/31] update optimum intel extra (#1882) --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 407f6a2a3fb..4c6ee680d99 100644 --- a/setup.py +++ b/setup.py @@ -75,10 +75,10 @@ "transformers[sentencepiece]>=4.26.0,<4.38.0", ], "diffusers": ["diffusers"], - "intel": "optimum-intel>=1.15.0", - "openvino": "optimum-intel[openvino]>=1.15.0", - "nncf": "optimum-intel[nncf]>=1.15.0", - "neural-compressor": "optimum-intel[neural-compressor]>=1.15.0", + "intel": "optimum-intel>=1.16.0", + "openvino": "optimum-intel[openvino]>=1.16.0", + "nncf": "optimum-intel[nncf]>=1.16.0", + "neural-compressor": "optimum-intel[neural-compressor]>=1.16.0", "graphcore": "optimum-graphcore", "habana": ["optimum-habana", "transformers >= 4.38.0, < 4.39.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers == 4.36.2"], From a2562e6a7bddb367d1dfe48f34e40712d24fa477 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Wed, 29 May 2024 16:03:01 +0200 Subject: [PATCH 22/31] Bump transformers version for neuron extras (#1881) --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 4c6ee680d99..b40eba068d5 100644 --- a/setup.py +++ b/setup.py @@ -81,8 +81,8 @@ "neural-compressor": "optimum-intel[neural-compressor]>=1.16.0", "graphcore": "optimum-graphcore", "habana": ["optimum-habana", "transformers >= 4.38.0, < 4.39.0"], - "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers == 4.36.2"], - "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers == 4.36.2"], + "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], + "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], "furiosa": "optimum-furiosa", "amd": "optimum-amd", "dev": TESTS_REQUIRE + QUALITY_REQUIRE, From 7a0757aeeb7a5a09d0f4e68aadfe0654277a0298 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 29 May 2024 16:35:15 +0200 Subject: [PATCH 23/31] Dev version --- optimum/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/version.py b/optimum/version.py index b71e4d4a8c3..6deb421ee56 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.20.0.dev0" +__version__ = "1.21.0.dev0" From ac951ca0ae49983b1515b84342fea15c3b7ec35c Mon Sep 17 00:00:00 2001 From: Zach Deane-Mayer <581590+zachmayer@users.noreply.github.com> Date: Wed, 5 Jun 2024 02:18:42 -0400 Subject: [PATCH 24/31] ORTOptimizer for the model type Segformer (#1820) * add segformer * black * make format * decoder_hidden_size not a list * tests pass now * use max * use zero --------- Co-authored-by: Zach Deane-Mayer --- optimum/onnxruntime/modeling_ort.py | 11 ++++++++--- optimum/onnxruntime/utils.py | 1 + optimum/utils/normalized_config.py | 15 ++++++++++++++- tests/onnxruntime/test_optimization.py | 2 ++ 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index eb38a7fef12..b65e1d3b29a 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -1746,13 +1746,18 @@ class ORTModelForSemanticSegmentation(ORTModel): checkpoint="optimum/segformer-b0-finetuned-ade-512-512", ) ) - def forward(self, **kwargs): - use_torch = isinstance(next(iter(kwargs.values())), torch.Tensor) + def forward( + self, + pixel_values: Union[torch.Tensor, np.ndarray], + **kwargs, + ): + use_torch = isinstance(pixel_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) if self.device.type == "cuda" and self.use_io_binding: io_binding = IOBindingHelper.prepare_io_binding( self, + pixel_values, **kwargs, ordered_input_names=self._ordered_input_names, ) @@ -1769,7 +1774,7 @@ def forward(self, **kwargs): # converts output to namedtuple for pipelines post-processing return SemanticSegmenterOutput(logits=outputs["logits"]) else: - onnx_inputs = self._prepare_onnx_inputs(use_torch=use_torch, **kwargs) + onnx_inputs = self._prepare_onnx_inputs(use_torch=use_torch, pixel_values=pixel_values, **kwargs) # run inference onnx_outputs = self.model.run(None, onnx_inputs) diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 0e1da447a64..37d0feefcc4 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -128,6 +128,7 @@ class ORTConfigManager: "nystromformer": "bert", "pegasus": "bert", "roberta": "bert", + "segformer": "vit", "t5": "bert", "vit": "vit", "whisper": "bart", diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 682f70e3ca3..81207b76496 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -102,6 +102,19 @@ class NormalizedVisionConfig(NormalizedConfig): INPUT_SIZE = "input_size" +class NormalizedSegformerConfig(NormalizedVisionConfig): + NUM_ATTENTION_HEADS = "num_attention_heads" + HIDDEN_SIZE = "hidden_sizes" + + # If the attribute is a list, return 0 + # 0 means let the optimizer infer the correct value based on the model graph + def __getattr__(self, attr_name): + attr_value = super().__getattr__(attr_name) + if isinstance(attr_value, list): + attr_value = 0 + return attr_value + + class NormalizedTextAndVisionConfig(NormalizedTextConfig, NormalizedVisionConfig): TEXT_CONFIG = None VISION_CONFIG = None @@ -203,7 +216,6 @@ class NormalizedConfigManager: 'owlvit', 'perceiver', 'roformer', - 'segformer', 'squeezebert', 'table-transformer', """ @@ -258,6 +270,7 @@ class NormalizedConfigManager: "regnet": NormalizedVisionConfig, "resnet": NormalizedVisionConfig, "roberta": NormalizedTextConfig, + "segformer": NormalizedSegformerConfig, "speech-to-text": SpeechToTextLikeNormalizedTextConfig, "splinter": NormalizedTextConfig, 
"t5": T5LikeNormalizedTextConfig, diff --git a/tests/onnxruntime/test_optimization.py b/tests/onnxruntime/test_optimization.py index c9cadbaa825..82109fcd11f 100644 --- a/tests/onnxruntime/test_optimization.py +++ b/tests/onnxruntime/test_optimization.py @@ -36,6 +36,7 @@ AutoOptimizationConfig, ORTConfig, ORTModelForImageClassification, + ORTModelForSemanticSegmentation, ORTModelForSequenceClassification, ORTOptimizer, ) @@ -171,6 +172,7 @@ def test_compare_original_seq2seq_model_with_optimized_model(self, model_cls, mo # Contribution note: Please add test models in alphabetical order. Find test models here: https://huggingface.co/hf-internal-testing. SUPPORTED_IMAGE_ARCHITECTURES_WITH_MODEL_ID = ( + (ORTModelForSemanticSegmentation, "hf-internal-testing/tiny-random-segformer"), (ORTModelForImageClassification, "hf-internal-testing/tiny-random-vit"), ) From 113b645dc7d0b7710803f23ffbf937ce6461ed1e Mon Sep 17 00:00:00 2001 From: GoldenTeethCN Date: Thu, 6 Jun 2024 16:42:52 +0800 Subject: [PATCH 25/31] fix: device consistence (#1891) * fix: device consistence * style: make style on ./optimum/gptq/quantizer.py --- optimum/gptq/quantizer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 2c2c9d7e71a..902af87bbb0 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -432,7 +432,10 @@ def store_input_hook(_, input, *args): for data in dataset: for k, v in data.items(): # put the data on gpu, we won't put them back to cpu - data[k] = v.to(0) + if not has_device_map or device.type == "cpu": + data[k] = v.to(0) + else: + data[k] = v.to(device) try: model(**data) except ValueError: @@ -458,7 +461,10 @@ def store_input_hook(_, input, *args): for data in dataset: for k, v in data.items(): # put the data on gpu, we won't put them back to cpu - data[k] = v.to(0) + if not has_device_map or device.type == "cpu": + data[k] = v.to(0) + else: + data[k] = v.to(device) try: model(**data) except ValueError: From f33f2f1d84f5da1e347d64d90a393e0e02a9ac5a Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Mon, 10 Jun 2024 10:06:44 +0200 Subject: [PATCH 26/31] Allow optimum to discover and load subpackages (#1894) As an alternative to directly adding their commands in a register.py file under the root optimum directory, this adds a decorator to declare a subcommand that can be used by subpackages when they are loaded. This will fix the issue of subcommands 'disappearing' when optimum is upgraded without reinstalling the subpackage. The onnxruntime commands are moved into a subpackage loader directory. This subpackage directory is only loaded (and its commands added) when the onnxruntime is available. This avoids wrongly indicating that the onnxruntime commands are available when the package is actually not installed. 
--- optimum/commands/__init__.py | 3 +- optimum/commands/optimum_cli.py | 57 +++++++++++-- optimum/onnxruntime/subpackage/__init__.py | 1 + .../subpackage/commands}/__init__.py | 2 - .../subpackage/commands}/base.py | 4 +- .../subpackage/commands}/optimize.py | 4 +- .../subpackage/commands}/quantize.py | 6 +- optimum/subpackages.py | 81 +++++++++++++++++++ 8 files changed, 142 insertions(+), 16 deletions(-) create mode 100644 optimum/onnxruntime/subpackage/__init__.py rename optimum/{commands/onnxruntime => onnxruntime/subpackage/commands}/__init__.py (87%) rename optimum/{commands/onnxruntime => onnxruntime/subpackage/commands}/base.py (91%) rename optimum/{commands/onnxruntime => onnxruntime/subpackage/commands}/optimize.py (96%) rename optimum/{commands/onnxruntime => onnxruntime/subpackage/commands}/quantize.py (95%) create mode 100644 optimum/subpackages.py diff --git a/optimum/commands/__init__.py b/optimum/commands/__init__.py index 540ea4dd863..8a2a276d1c5 100644 --- a/optimum/commands/__init__.py +++ b/optimum/commands/__init__.py @@ -15,5 +15,4 @@ from .base import BaseOptimumCLICommand, CommandInfo, RootOptimumCLICommand from .env import EnvironmentCommand from .export import ExportCommand, ONNXExportCommand, TFLiteExportCommand -from .onnxruntime import ONNXRuntimeCommand, ONNXRuntimeOptimizeCommand, ONNXRuntimeQuantizeCommand -from .optimum_cli import register_optimum_cli_subcommand +from .optimum_cli import optimum_cli_subcommand diff --git a/optimum/commands/optimum_cli.py b/optimum/commands/optimum_cli.py index 4bae9bb5f82..64a7075c6ce 100644 --- a/optimum/commands/optimum_cli.py +++ b/optimum/commands/optimum_cli.py @@ -17,16 +17,57 @@ from pathlib import Path from typing import Dict, List, Optional, Tuple, Type, Union +from ..subpackages import load_subpackages from ..utils import logging from .base import BaseOptimumCLICommand, CommandInfo, RootOptimumCLICommand from .env import EnvironmentCommand from .export import ExportCommand -from .onnxruntime import ONNXRuntimeCommand logger = logging.get_logger() -OPTIMUM_CLI_SUBCOMMANDS = [ExportCommand, EnvironmentCommand, ONNXRuntimeCommand] +# The table below contains the optimum-cli root subcommands provided by the optimum package +OPTIMUM_CLI_ROOT_SUBCOMMANDS = [ExportCommand, EnvironmentCommand] + +# The table below is dynamically populated when loading subpackages +_OPTIMUM_CLI_SUBCOMMANDS = [] + + +def optimum_cli_subcommand(parent_command: Optional[Type[BaseOptimumCLICommand]] = None): + """ + A decorator to declare optimum-cli subcommands. + + The declaration of an optimum-cli subcommand looks like this: + + ``` + @optimum_cli_subcommand() + class MySubcommand(BaseOptimumCLICommand): + + ``` + + or + + ``` + @optimum_cli_subcommand(ExportCommand) + class MySubcommand(BaseOptimumCLICommand): + + ``` + + Args: + parent_command: (`Optional[Type[BaseOptimumCLICommand]]`): + The class of the parent command or None if this is a top-level command. Defaults to None. 
+ + """ + + if parent_command is not None and not issubclass(parent_command, BaseOptimumCLICommand): + raise ValueError(f"The parent command {parent_command} must be a subclass of BaseOptimumCLICommand") + + def wrapper(subcommand): + if not issubclass(subcommand, BaseOptimumCLICommand): + raise ValueError(f"The subcommand {subcommand} must be a subclass of BaseOptimumCLICommand") + _OPTIMUM_CLI_SUBCOMMANDS.append((subcommand, parent_command)) + + return wrapper def resolve_command_to_command_instance( @@ -137,15 +178,19 @@ def main(): root = RootOptimumCLICommand("Optimum CLI tool", usage="optimum-cli") parser = root.parser - for subcommand_cls in OPTIMUM_CLI_SUBCOMMANDS: + for subcommand_cls in OPTIMUM_CLI_ROOT_SUBCOMMANDS: register_optimum_cli_subcommand(subcommand_cls, parent_command=root) - commands_in_register = dynamic_load_commands_in_register() + # Load subpackages to give them a chance to declare their own subcommands + load_subpackages() + + # Register subcommands declared by the subpackages or found in the register files under commands/register + commands_to_register = _OPTIMUM_CLI_SUBCOMMANDS + dynamic_load_commands_in_register() command2command_instance = resolve_command_to_command_instance( - root, [parent_command_cls for _, parent_command_cls in commands_in_register if parent_command_cls is not None] + root, [parent_command_cls for _, parent_command_cls in commands_to_register if parent_command_cls is not None] ) - for command_or_command_info, parent_command in commands_in_register: + for command_or_command_info, parent_command in commands_to_register: if parent_command is None: parent_command_instance = root else: diff --git a/optimum/onnxruntime/subpackage/__init__.py b/optimum/onnxruntime/subpackage/__init__.py new file mode 100644 index 00000000000..7029af7132f --- /dev/null +++ b/optimum/onnxruntime/subpackage/__init__.py @@ -0,0 +1 @@ +from .commands import ONNXRuntimeCommand diff --git a/optimum/commands/onnxruntime/__init__.py b/optimum/onnxruntime/subpackage/commands/__init__.py similarity index 87% rename from optimum/commands/onnxruntime/__init__.py rename to optimum/onnxruntime/subpackage/commands/__init__.py index 1b9c24c3b2c..44facf5ea53 100644 --- a/optimum/commands/onnxruntime/__init__.py +++ b/optimum/onnxruntime/subpackage/commands/__init__.py @@ -14,5 +14,3 @@ # limitations under the License. from .base import ONNXRuntimeCommand -from .optimize import ONNXRuntimeOptimizeCommand -from .quantize import ONNXRuntimeQuantizeCommand diff --git a/optimum/commands/onnxruntime/base.py b/optimum/onnxruntime/subpackage/commands/base.py similarity index 91% rename from optimum/commands/onnxruntime/base.py rename to optimum/onnxruntime/subpackage/commands/base.py index 53e3245ea4d..df4414c19d5 100644 --- a/optimum/commands/onnxruntime/base.py +++ b/optimum/onnxruntime/subpackage/commands/base.py @@ -14,11 +14,13 @@ # limitations under the License. """optimum.onnxruntime command-line interface base classes.""" -from .. 
import BaseOptimumCLICommand, CommandInfo +from optimum.commands import BaseOptimumCLICommand, CommandInfo, optimum_cli_subcommand + from .optimize import ONNXRuntimeOptimizeCommand from .quantize import ONNXRuntimeQuantizeCommand +@optimum_cli_subcommand() class ONNXRuntimeCommand(BaseOptimumCLICommand): COMMAND = CommandInfo( name="onnxruntime", diff --git a/optimum/commands/onnxruntime/optimize.py b/optimum/onnxruntime/subpackage/commands/optimize.py similarity index 96% rename from optimum/commands/onnxruntime/optimize.py rename to optimum/onnxruntime/subpackage/commands/optimize.py index 5890e0a07c7..1dd82f0ee22 100644 --- a/optimum/commands/onnxruntime/optimize.py +++ b/optimum/onnxruntime/subpackage/commands/optimize.py @@ -75,8 +75,8 @@ def parse_args(parser: "ArgumentParser"): return parse_args_onnxruntime_optimize(parser) def run(self): - from ...onnxruntime.configuration import AutoOptimizationConfig, ORTConfig - from ...onnxruntime.optimization import ORTOptimizer + from ...configuration import AutoOptimizationConfig, ORTConfig + from ...optimization import ORTOptimizer if self.args.output == self.args.onnx_model: raise ValueError("The output directory must be different than the directory hosting the ONNX model.") diff --git a/optimum/commands/onnxruntime/quantize.py b/optimum/onnxruntime/subpackage/commands/quantize.py similarity index 95% rename from optimum/commands/onnxruntime/quantize.py rename to optimum/onnxruntime/subpackage/commands/quantize.py index 2613cb33ba6..6f6d843cc70 100644 --- a/optimum/commands/onnxruntime/quantize.py +++ b/optimum/onnxruntime/subpackage/commands/quantize.py @@ -17,7 +17,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from .. import BaseOptimumCLICommand +from optimum.commands import BaseOptimumCLICommand if TYPE_CHECKING: @@ -69,8 +69,8 @@ def parse_args(parser: "ArgumentParser"): return parse_args_onnxruntime_quantize(parser) def run(self): - from ...onnxruntime.configuration import AutoQuantizationConfig, ORTConfig - from ...onnxruntime.quantization import ORTQuantizer + from ...configuration import AutoQuantizationConfig, ORTConfig + from ...quantization import ORTQuantizer if self.args.output == self.args.onnx_model: raise ValueError("The output directory must be different than the directory hosting the ONNX model.") diff --git a/optimum/subpackages.py b/optimum/subpackages.py new file mode 100644 index 00000000000..8729581521a --- /dev/null +++ b/optimum/subpackages.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib +import logging +import sys + + +if sys.version_info >= (3, 8): + from importlib import metadata as importlib_metadata +else: + import importlib_metadata +from importlib.util import find_spec, module_from_spec + +from .utils import is_onnxruntime_available + + +logger = logging.getLogger(__name__) + + +def load_namespace_modules(namespace: str, module: str): + """Load modules with a specific name inside a namespace + + This method operates on namespace packages: + https://packaging.python.org/en/latest/guides/packaging-namespace-packages/ + + For each package inside the specified `namespace`, it looks for the specified `module` and loads it. + + Args: + namespace (`str`): + The namespace containing modules to be loaded. + module (`str`): + The name of the module to load in each namespace package. + """ + for dist in importlib_metadata.distributions(): + dist_name = dist.metadata["Name"] + if not dist_name.startswith(f"{namespace}-"): + continue + package_import_name = dist_name.replace("-", ".") + module_import_name = f"{package_import_name}.{module}" + if module_import_name in sys.modules: + # Module already loaded + continue + backend_spec = find_spec(module_import_name) + if backend_spec is None: + continue + try: + imported_module = module_from_spec(backend_spec) + sys.modules[module_import_name] = imported_module + backend_spec.loader.exec_module(imported_module) + logger.debug(f"Successfully loaded {module_import_name}") + except Exception as e: + logger.error(f"An exception occured while loading {module_import_name}: {e}.") + + +def load_subpackages(): + """Load optimum subpackages + + This method goes through packages inside the `optimum` namespace and loads the `subpackage` module if it exists. + + This module is then in charge of registering the subpackage commands. + """ + SUBPACKAGE_LOADER = "subpackage" + load_namespace_modules("optimum", SUBPACKAGE_LOADER) + + # Load subpackages from internal modules not explicitly defined as namespace packages + loader_name = "." 
+ SUBPACKAGE_LOADER + if is_onnxruntime_available(): + importlib.import_module(loader_name, package="optimum.onnxruntime") From 35f636707f18d9c3f996ee31a8d32515424b94af Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 10 Jun 2024 10:07:16 +0200 Subject: [PATCH 27/31] feat(ci): add trufflehog secrets detector (#1899) --- .github/workflows/trufflehog.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/trufflehog.yml diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml new file mode 100644 index 00000000000..164b4f2f8f7 --- /dev/null +++ b/.github/workflows/trufflehog.yml @@ -0,0 +1,23 @@ +on: + push: + +name: Secret Leaks + +permissions: + contents: read + id-token: write + issues: write + pull-requests: write + +jobs: + trufflehog: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Secret Scanning + uses: trufflesecurity/trufflehog@main + + From db51410ae5ef4cbde7518cf01a997239dffbde1d Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 10 Jun 2024 11:42:08 +0200 Subject: [PATCH 28/31] fix(ci): remove unnecessary permissions (#1904) --- .github/workflows/trufflehog.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml index 164b4f2f8f7..c71afbbb459 100644 --- a/.github/workflows/trufflehog.yml +++ b/.github/workflows/trufflehog.yml @@ -3,12 +3,6 @@ on: name: Secret Leaks -permissions: - contents: read - id-token: write - issues: write - pull-requests: write - jobs: trufflehog: runs-on: ubuntu-latest From f4809307e409d5ce698364ad48b69d38e0c406e9 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Fri, 14 Jun 2024 16:33:33 +0200 Subject: [PATCH 29/31] Remove read token (#1903) * remove read token * rename var & use org model * style & remove token * fix failing tests on datasets release --- .github/workflows/test_onnxruntime.yml | 2 ++ optimum/utils/testing_utils.py | 3 --- tests/onnxruntime/test_modeling.py | 11 +++++++---- tests/utils/test_task_processors.py | 7 ++++++- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 4893b681a66..291a3b08335 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -50,6 +50,8 @@ jobs: pytest onnxruntime -m "run_in_series" --durations=0 -vvvv -s - name: Test with pytest (in parallel) + env: + FXMARTYCLONE_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} working-directory: tests run: | pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index f1c2f668e3c..a7c2b8bb050 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -36,9 +36,6 @@ # Used to test the hub USER = "__DUMMY_OPTIMUM_USER__" -# Not critical, only usable on the sandboxed CI instance. 
-TOKEN = "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" - def flatten_dict(dictionary: Dict): """ diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 3fe2c5e14dc..7b2c8a66b9e 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -938,11 +938,14 @@ def test_stable_diffusion_model_on_rocm_ep_str(self): self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) def test_load_model_from_hub_private(self): - subprocess.run("huggingface-cli logout", shell=True) - # Read token of fxmartyclone (dummy user). - token = "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" + token = os.environ.get("HF_HUB_READ_TOKEN", None) - model = ORTModelForCustomTasks.from_pretrained("fxmartyclone/tiny-onnx-private-2", use_auth_token=token) + if token is None: + self.skipTest("Test requires a token for fxmartyclone in the environment variable `HF_HUB_READ_TOKEN`.") + + model = ORTModelForCustomTasks.from_pretrained( + "optimum-internal-testing/tiny-random-phi-private", use_auth_token=token + ) self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index af89aec2b90..16567048073 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -50,7 +50,7 @@ "dataset_data_keys": {"question": "question", "context": "answer"}, }, "image-classification": { - "dataset_args": "mnist", + "dataset_args": "sasha/dog-food", "dataset_data_keys": {"image": "image"}, }, } @@ -232,6 +232,11 @@ def test_load_dataset_with_max_length(self): input_ids = dataset[0]["input_ids"] self.assertEqual(len(input_ids), max_length) + def test_load_default_dataset(self): + self.skipTest( + "Skipping so as not to execute conll2003 remote code (test would require trust_remote_code=True)" + ) + class QuestionAnsweringProcessorTest(TestCase, TaskProcessorTestBase): TASK_NAME = "question-answering" From 8b43dd2f9fa17c2e08520bf61d1bdc17b8115d69 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 18 Jun 2024 16:14:50 +0200 Subject: [PATCH 30/31] Remove dataset with restrictive license (#1910) * rm dataset with restrictive license * format --- optimum/gptq/data.py | 41 ++++++--------------------------- tests/gptq/test_quantization.py | 2 +- 2 files changed, 8 insertions(+), 35 deletions(-) diff --git a/optimum/gptq/data.py b/optimum/gptq/data.py index 37a42714fc8..b8734da478e 100644 --- a/optimum/gptq/data.py +++ b/optimum/gptq/data.py @@ -182,40 +182,11 @@ def get_c4_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train") def get_ptb(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): - if split == "train": - data = load_dataset("ptb_text_only", "penn_treebank", split="train") - elif split == "validation": - data = load_dataset("ptb_text_only", "penn_treebank", split="validation") - - enc = tokenizer(" ".join(data["sentence"]), return_tensors="pt") - - dataset = [] - for _ in range(nsamples): - i = random.randint(0, enc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = enc.input_ids[:, i:j] - attention_mask = torch.ones_like(inp) - dataset.append({"input_ids": inp, "attention_mask": attention_mask}) - - return dataset + raise RuntimeError("Loading the `ptb` dataset was deprecated") def get_ptb_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): - if split == "train": - data = 
load_dataset("ptb_text_only", "penn_treebank", split="train") - elif split == "validation": - data = load_dataset("ptb_text_only", "penn_treebank", split="test") - - enc = tokenizer(" ".join(data["sentence"]), return_tensors="pt") - - dataset = [] - for _ in range(nsamples): - i = random.randint(0, enc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = enc.input_ids[:, i:j] - attention_mask = torch.ones_like(inp) - dataset.append({"input_ids": inp, "attention_mask": attention_mask}) - return dataset + raise RuntimeError("Loading the `ptb` dataset was deprecated") def get_dataset( @@ -226,7 +197,7 @@ def get_dataset( Args: dataset_name (`str`): - Dataset name. Available options are `['wikitext2', 'c4', 'ptb', 'c4-new', 'ptb_new']`. + Dataset name. Available options are `['wikitext2', 'c4', 'c4-new']`. tokenizer (`Any`): Tokenizer of the model nsamples (`int`, defaults to `128`): @@ -247,11 +218,13 @@ def get_dataset( "wikitext2": get_wikitext2, "c4": get_c4, "c4-new": get_c4_new, - "ptb": get_ptb, - "ptb-new": get_ptb_new, } if split not in ["train", "validation"]: raise ValueError(f"The split need to be 'train' or 'validation' but found {split}") + if dataset_name in {"ptb", "ptb-new"}: + raise ValueError( + f"{dataset_name} dataset was deprecated, only the following dataset are supported : {list(get_dataset_map)}" + ) if dataset_name not in get_dataset_map: raise ValueError(f"Expected a value in {list(get_dataset_map.keys())} but found {dataset_name}") get_dataset_fn = get_dataset_map[dataset_name] diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 0c070f8c9e4..5ed1619fde3 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -394,7 +394,7 @@ class GPTQDataTest(unittest.TestCase): def setUp(self): self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True) - @parameterized.expand(["wikitext2", "c4", "ptb", "c4-new", "ptb-new"]) + @parameterized.expand(["wikitext2", "c4", "c4-new"]) def test_dataset(self, dataset): train_dataset = get_dataset( dataset, self.tokenizer, nsamples=self.NBSAMPLES, seqlen=self.SEQLEN, split="train" From aad4b8beff3194af2679f762e2097113943c9f07 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 24 Jun 2024 11:13:47 +0100 Subject: [PATCH 31/31] Fix Windows and onnx dtype compatibility (#1886) * fix pkv and audio * add t5 test * fix seq2seq * fix vision2seq tests as it seems to have had always outputed kv cache in torch format before * fix folder deletion on windows * fix temporary directory removal on windows * remove attention_mask creation as ORTModelForxxx's corresponding processors will create it * remove_directory utility function --- optimum/onnxruntime/base.py | 124 ++---- optimum/onnxruntime/modeling_decoder.py | 73 ++-- optimum/onnxruntime/modeling_ort.py | 515 +++++++++--------------- optimum/utils/testing_utils.py | 14 + tests/onnxruntime/test_modeling.py | 58 +-- 5 files changed, 299 insertions(+), 485 deletions(-) diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index bf9c80a86cd..16461dce957 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -14,7 +14,7 @@ """Defines the base classes that are used to perform inference with ONNX Runtime of Transformers models.""" from abc import abstractmethod -from typing import TYPE_CHECKING, Dict, Optional, Set, Tuple, Union +from typing import Dict, Optional, Set, Tuple, Union import numpy as np import torch @@ 
-24,22 +24,22 @@ from ..utils import NormalizedConfigManager from ..utils.logging import warn_once +from .modeling_ort import ORTModel from .utils import get_ordered_input_names, logging logger = logging.get_logger(__name__) -if TYPE_CHECKING: - from .modeling_ort import ORTModel - - class ORTModelPart: """ For multi-file ONNX models, such as encoder-decoder models, represents a part of the model. It has its own `onnxruntime.InferenceSession`, and can perform a forward pass. """ + _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs + _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs + def __init__( self, session: InferenceSession, @@ -53,6 +53,8 @@ def __init__( self.main_input_name = self.parent_model.main_input_name self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} + self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()} self._ordered_input_names = get_ordered_input_names(self.input_names.keys(), func=self.forward) @@ -98,25 +100,13 @@ def forward( last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) else: - if use_torch: - onnx_inputs = {"input_ids": input_ids.cpu().detach().numpy()} - - # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names: - onnx_inputs["attention_mask"] = attention_mask.cpu().detach().numpy() - else: - onnx_inputs = {"input_ids": input_ids} + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} - # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names: - onnx_inputs["attention_mask"] = attention_mask + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # Run inference - outputs = self.session.run(None, onnx_inputs) - - last_hidden_state = outputs[self.output_names["last_hidden_state"]] - if use_torch: - last_hidden_state = torch.from_numpy(last_hidden_state).to(self.device) + last_hidden_state = model_outputs["last_hidden_state"] return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -350,83 +340,29 @@ def forward( else: raise ValueError("Unsupported num_pkv") else: - if use_torch: - onnx_inputs = { - "input_ids": input_ids.cpu().detach().numpy(), - } - - # Add the encoder_hidden_states inputs when needed - if "encoder_hidden_states" in self.input_names: - onnx_inputs["encoder_hidden_states"] = encoder_hidden_states.cpu().detach().numpy() - - # Add the decoder_attention_mask inputs when needed - if "decoder_attention_mask" in self.input_names: - onnx_inputs["decoder_attention_mask"] = decoder_attention_mask.cpu().detach().numpy() - - # Add the encoder_attention_mask inputs when needed - if "encoder_attention_mask" in self.input_names: - onnx_inputs["encoder_attention_mask"] = encoder_attention_mask.cpu().detach().numpy() - - if past_key_values is not None: - # Add the past_key_values to the decoder inputs - for input_name, past_key_value in zip(self.key_value_input_names, past_key_values): - onnx_inputs[input_name] = past_key_value.cpu().detach().numpy() - - if "labels" in self.input_names: - # TODO: Any preprocessing like `self._shift_right(labels)`? 
- onnx_inputs["labels"] = labels.cpu().detach().numpy() - - if self.parent_model.use_merged is True: - onnx_inputs["use_cache_branch"] = use_cache_branch_tensor.cpu().detach().numpy() - else: - onnx_inputs = { - "input_ids": input_ids, - } - - # Add the encoder_hidden_states inputs when needed - if "encoder_hidden_states" in self.input_names: - onnx_inputs["encoder_hidden_states"] = encoder_hidden_states - - # Add the decoder_attention_mask inputs when needed - if "decoder_attention_mask" in self.input_names: - onnx_inputs["decoder_attention_mask"] = decoder_attention_mask - - # Add the encoder_attention_mask inputs when needed - if "encoder_attention_mask" in self.input_names: - onnx_inputs["encoder_attention_mask"] = encoder_attention_mask - - if past_key_values is not None: - # Add the past_key_values to the decoder inputs - for input_name, past_key_value in zip(self.key_value_input_names, past_key_values): - onnx_inputs[input_name] = past_key_value - - if "labels" in self.input_names: - # TODO: Any preprocessing like `self._shift_right(labels)`? - onnx_inputs["labels"] = labels - - if self.parent_model.use_merged is True: - onnx_inputs["use_cache_branch"] = use_cache_branch_tensor + model_inputs = { + "input_ids": input_ids, + "encoder_hidden_states": encoder_hidden_states, + "decoder_attention_mask": decoder_attention_mask, + "encoder_attention_mask": encoder_attention_mask, + "use_cache_branch": use_cache_branch_tensor, + "labels": labels, + } + if past_key_values is not None: + model_inputs.update(zip(self.key_value_input_names, past_key_values)) - # Run inference - outputs = self.session.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # TODO: using two loops here is probably unefficient + # TODO: using a new variable out_past_key_values is memory inefficient, + # past_key_values is not used anymore at this point # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the # self-attention layer and 2 to the cross-attention layer) - out_past_key_values = tuple( - torch.from_numpy(outputs[self.output_names[key]]).to(self.device) - for key in self.key_value_output_names - ) - - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + out_past_key_values = tuple(model_outputs[output_name] for output_name in self.key_value_output_names) - loss = None - if "loss" in self.output_names: - loss = outputs[self.output_names["loss"]] - if use_torch: - loss = torch.from_numpy(loss).to(self.device) + loss = model_outputs.get("loss", None) + logits = model_outputs["logits"] # TODO: this is extremely ugly and unreadable. What if cross-attention k/v change? 
# Tuple of tuple of length `n_layers`, with each tuple of length equal to: diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index 2d9be2d757f..5d4bbe184e1 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -46,7 +46,7 @@ if check_if_transformers_greater("4.25.0"): from transformers.generation import GenerationMixin else: - from transformers.generation_utils import GenerationMixin + from transformers.generation_utils import GenerationMixin # type: ignore # noqa: F401 logger = logging.getLogger(__name__) @@ -139,15 +139,16 @@ def __init__( self.num_pkv = 2 self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) - self.key_value_input_names = [key for key in self.inputs_names if (".key" in key) or (".value" in key)] + self.key_value_input_names = [key for key in self.input_names if (".key" in key) or (".value" in key)] self.key_value_output_names = [key for key in self.output_names if (".key" in key) or (".value" in key)] self.use_cache = len(self.key_value_input_names) > 0 if generation_config is None: generation_config = GenerationConfig.from_model_config(config) + self.generation_config = generation_config self.onnx_paths = [self.model_path] - self.use_merged = "use_cache_branch" in self.inputs_names + self.use_merged = "use_cache_branch" in self.input_names self.model_type = self.config.model_type self.use_fp16 = False @@ -160,7 +161,7 @@ def __init__( # Reference: https://github.com/huggingface/optimum/pull/1381 model_type = config.model_type.replace("_", "-") - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS and "position_ids" not in self.inputs_names: + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS and "position_ids" not in self.input_names: logger.warning( f"ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture {model_type}. " "We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support." @@ -202,7 +203,6 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - inputs = {} known_output_shapes = {} use_cache_branch = None loss = None @@ -226,10 +226,10 @@ def forward( # I suspect the reason is the contiguous python list that messes something up? 
model_inputs = [input_ids.contiguous()] - if "attention_mask" in self.inputs_names: + if "attention_mask" in self.input_names: model_inputs.append(attention_mask) - if "position_ids" in self.inputs_names: + if "position_ids" in self.input_names: if position_ids is None: raise ValueError("position_ids was not passed but is a required input for this ONNX model.") model_inputs.append(position_ids.contiguous()) @@ -240,12 +240,11 @@ def forward( if use_cache_branch is not None: model_inputs.append(use_cache_branch) - if "labels" in self.inputs_names: + if "labels" in self.input_names: model_inputs.append(labels) known_output_shapes.update({"loss": []}) - io_binding, output_shapes, output_buffers = self._prepare_io_binding( - self.model, + io_binding, output_shapes, output_buffers = self.prepare_io_binding( *model_inputs, known_output_shapes=known_output_shapes, ordered_input_names=self._ordered_input_names, @@ -259,53 +258,41 @@ def forward( io_binding.synchronize_outputs() if self.use_cache: - # Tuple of length equal to : number of layer * number of past_key_value per decoder layer(2) - past_key_values = () - for name in self.key_value_output_names: - past_key_values += (output_buffers[name].view(output_shapes[name]),) + # Tuple of length equal to : number of layer * number of past_key_value per decoder layer(2 for the self-attention) + past_key_values = tuple( + output_buffers[name].view(output_shapes[name]) for name in self.key_value_output_names + ) logits = output_buffers["logits"].view(output_shapes["logits"]) if "loss" in self.output_names: loss = output_buffers["loss"].view(output_shapes["loss"]) else: - inputs["input_ids"] = input_ids.cpu().detach().numpy() if use_torch else input_ids - - if "attention_mask" in self.inputs_names: - inputs["attention_mask"] = attention_mask.cpu().detach().numpy() if use_torch else attention_mask - - if "labels" in self.inputs_names: - inputs["labels"] = labels.cpu().detach().numpy() if use_torch else labels - - if "position_ids" in self.inputs_names: - if position_ids is None: - raise ValueError("position_ids was not passed but is a required input for this ONNX model.") - inputs["position_ids"] = position_ids.cpu().detach().numpy() if use_torch else position_ids - - # Add the past_key_values to the decoder inputs + model_inputs = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "use_cache_branch": use_cache_branch, + "labels": labels, + } if past_key_values is not None: - for input_name, past_key_value in zip(self.key_value_input_names, past_key_values): - inputs[input_name] = past_key_value.cpu().detach().numpy() if use_torch else past_key_value + model_inputs.update( + zip(self.key_value_input_names, past_key_values), + ) - if use_cache_branch is not None: - inputs["use_cache_branch"] = use_cache_branch.cpu().detach().numpy() if use_torch else use_cache_branch + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - outputs = self.model.run(None, inputs) + loss = model_outputs.get("loss", None) + logits = model_outputs["logits"] if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 for the self-attention) - past_key_values = tuple( - torch.from_numpy(outputs[self.output_names[key]]).to(self.device) - for key in self.key_value_output_names - ) - - logits = 
torch.from_numpy(outputs[self.output_names["logits"]]).to(self.device) - if "loss" in self.output_names: - loss = torch.from_numpy(outputs[self.output_names["loss"]]).to(self.device) + past_key_values = tuple(model_outputs[output_name] for output_name in self.key_value_output_names) if self.use_cache and self.model_type != "gpt_bigcode": - # Tuple of tuple of length `n_layers`, with each tuple of length equal to the number of self-attention and - # per decoder layer + # Tuple of tuple of length `n_layers`, with each tuple of length equal to the number of self-attention and per decoder layer past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) ) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index b65e1d3b29a..734c9b65515 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -267,10 +267,13 @@ def __init__( **kwargs, ) - self.inputs_names = {input_key.name: idx for idx, input_key in enumerate(model.get_inputs())} + self.input_names = {input_key.name: idx for idx, input_key in enumerate(model.get_inputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in model.get_inputs()} + self.output_names = {output_key.name: idx for idx, output_key in enumerate(model.get_outputs())} + self.output_dtypes = {output_key.name: output_key.type for output_key in model.get_outputs()} - self._ordered_input_names = get_ordered_input_names(self.inputs_names.keys(), func=self.forward) + self._ordered_input_names = get_ordered_input_names(self.input_names.keys(), func=self.forward) # TODO: why do we make device a property since we are only access the value, and do not do any check when setting the value? @property @@ -736,6 +739,7 @@ def _output_shape_inference(self, axis_name: Union[str, int], dimensions: Dict[s # exception. return int(eval(" ".join(tokens))) + # TODO: this method is bloated with state arguments (that are accesible using self) why ? def _prepare_io_binding( self, model: ort.InferenceSession, @@ -833,9 +837,15 @@ def _prepare_io_binding( return io_binding, output_shapes, output_buffers - def prepare_io_binding(self, *model_inputs, ordered_input_names, known_output_shapes=None): + def prepare_io_binding( + self, *model_inputs, ordered_input_names, outputs_to_not_bind=None, known_output_shapes=None + ): return self._prepare_io_binding( - self.model, ordered_input_names=ordered_input_names, known_output_shapes=known_output_shapes, *model_inputs + self.model, + *model_inputs, + ordered_input_names=ordered_input_names, + known_output_shapes=known_output_shapes, + outputs_to_not_bind=outputs_to_not_bind, ) def raise_on_numpy_input_io_binding(self, use_torch: bool): @@ -852,6 +862,39 @@ def raise_on_numpy_input_io_binding(self, use_torch: bool): " with model.use_io_binding = False, or pass torch.Tensor inputs instead." 
)
 
+    def _prepare_onnx_inputs(
+        self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray]
+    ) -> Dict[str, np.ndarray]:
+        onnx_inputs = {}
+
+        # converts pytorch inputs into numpy inputs for onnx
+        for input_name in self.input_names.keys():
+            onnx_inputs[input_name] = inputs.pop(input_name)
+
+            if use_torch:
+                onnx_inputs[input_name] = onnx_inputs[input_name].cpu().detach().numpy()
+
+            if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]:
+                onnx_inputs[input_name] = onnx_inputs[input_name].astype(
+                    TypeHelper.ort_type_to_numpy_type(self.input_dtypes[input_name])
+                )
+
+        return onnx_inputs
+
+    def _prepare_onnx_outputs(
+        self, use_torch: bool, *onnx_outputs: np.ndarray
+    ) -> Dict[str, Union[torch.Tensor, np.ndarray]]:
+        model_outputs = {}
+
+        # converts onnxruntime outputs into tensor for standard outputs
+        for output_name, idx in self.output_names.items():
+            model_outputs[output_name] = onnx_outputs[idx]
+
+            if use_torch:
+                model_outputs[output_name] = torch.from_numpy(model_outputs[output_name]).to(self.device)
+
+        return model_outputs
+
     @staticmethod
     def _cached_file(
         model_path: Union[Path, str],
@@ -970,9 +1013,6 @@ def forward(
         self.raise_on_numpy_input_io_binding(use_torch)
 
         if self.device.type == "cuda" and self.use_io_binding:
-            if attention_mask is None:
-                attention_mask = torch.ones_like(input_ids)
-
             io_binding, output_shapes, output_buffers = self.prepare_io_binding(
                 input_ids,
                 attention_mask,
@@ -985,35 +1025,21 @@ def forward(
             self.model.run_with_iobinding(io_binding)
             io_binding.synchronize_outputs()
 
-            # converts output to namedtuple for pipelines post-processing
-            return BaseModelOutput(
-                last_hidden_state=output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"])
-            )
+            last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"])
         else:
-            if use_torch:
-                input_ids = input_ids.cpu().detach().numpy()
-                if attention_mask is None:
-                    attention_mask = np.ones_like(input_ids)
-                else:
-                    attention_mask = attention_mask.cpu().detach().numpy()
-                if token_type_ids is not None:
-                    token_type_ids = token_type_ids.cpu().detach().numpy()
+            model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids}
 
-            onnx_inputs = {
-                "input_ids": input_ids,
-                "attention_mask": attention_mask,
-            }
-            if token_type_ids is not None:
-                onnx_inputs["token_type_ids"] = token_type_ids
-
-            outputs = self.model.run(None, onnx_inputs)
+            onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs)
+            onnx_outputs = self.model.run(None, onnx_inputs)
+            model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs)
 
-            last_hidden_state = outputs[self.output_names["last_hidden_state"]]
-            if use_torch:
-                last_hidden_state = torch.from_numpy(last_hidden_state).to(self.device)
+            # TODO: why do we only return last_hidden_state? why not all outputs?
+            # that way, there will be less need for ORTModelForCustomTask in cases where
+            # we just want to extend model outputs with attentions, hidden_states, etc.
+ last_hidden_state = model_outputs["last_hidden_state"] - # converts output to namedtuple for pipelines post-processing - return BaseModelOutput(last_hidden_state=last_hidden_state) + # converts output to namedtuple for pipelines post-processing + return BaseModelOutput(last_hidden_state=last_hidden_state) @classmethod def _export( @@ -1144,32 +1170,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return MaskedLMOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() - - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids - - # run inference - outputs = self.model.run(None, onnx_inputs) - logits = outputs[self.output_names["logits"]] + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return MaskedLMOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return MaskedLMOutput(logits=logits) QUESTION_ANSWERING_EXAMPLE = r""" @@ -1247,37 +1259,21 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return QuestionAnsweringModelOutput( - start_logits=output_buffers["start_logits"].view(output_shapes["start_logits"]), - end_logits=output_buffers["end_logits"].view(output_shapes["end_logits"]), - ) + # TODO: this is the same routine in all io binding branches, should we refactor it into a prepare_io_binding_outputs method? 
+ start_logits = output_buffers["start_logits"].view(output_shapes["start_logits"]) + end_logits = output_buffers["end_logits"].view(output_shapes["end_logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() - - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids - - # run inference - outputs = self.model.run(None, onnx_inputs) - - start_logits = outputs[self.output_names["start_logits"]] - end_logits = outputs[self.output_names["end_logits"]] - if use_torch: - start_logits = torch.from_numpy(start_logits).to(self.device) - end_logits = torch.from_numpy(end_logits).to(self.device) + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - # converts output to namedtuple for pipelines post-processing - return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + start_logits = model_outputs["start_logits"] + end_logits = model_outputs["end_logits"] + + # converts output to namedtuple for pipelines post-processing + return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) SEQUENCE_CLASSIFICATION_EXAMPLE = r""" @@ -1370,30 +1366,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return SequenceClassifierOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids - - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return SequenceClassifierOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return SequenceClassifierOutput(logits=logits) TOKEN_CLASSIFICATION_EXAMPLE = r""" @@ -1472,32 +1456,17 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return TokenClassifierOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - 
attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() - - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids - - # run inference - outputs = self.model.run(None, onnx_inputs) - logits = outputs[self.output_names["logits"]] + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return TokenClassifierOutput(logits=logits) + return TokenClassifierOutput(logits=logits) MULTIPLE_CHOICE_EXAMPLE = r""" @@ -1570,31 +1539,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return MultipleChoiceModelOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() - - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids - - # run inference - outputs = self.model.run(None, onnx_inputs) - logits = outputs[self.output_names["logits"]] + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return MultipleChoiceModelOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return MultipleChoiceModelOutput(logits=logits) IMAGE_CLASSIFICATION_EXAMPLE = r""" @@ -1662,7 +1618,8 @@ def forward( if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( - pixel_values, ordered_input_names=self._ordered_input_names + pixel_values, + ordered_input_names=self._ordered_input_names, ) # run inference with binding & synchronize in case of multiple CUDA streams @@ -1670,25 +1627,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return ImageClassifierOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - pixel_values = pixel_values.cpu().detach().numpy() + model_inputs = {"pixel_values": pixel_values} - onnx_inputs = { - "pixel_values": pixel_values, - } - - # run inference - outputs = self.model.run(None, onnx_inputs) - logits = 
outputs[self.output_names["logits"]] + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return ImageClassifierOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return ImageClassifierOutput(logits=logits) SEMANTIC_SEGMENTATION_EXAMPLE = r""" @@ -1755,47 +1705,28 @@ def forward( self.raise_on_numpy_input_io_binding(use_torch) if self.device.type == "cuda" and self.use_io_binding: - io_binding = IOBindingHelper.prepare_io_binding( - self, + io_binding, output_shapes, output_buffers = self.prepare_io_binding( pixel_values, - **kwargs, ordered_input_names=self._ordered_input_names, ) - # run inference with binding + # run inference with binding & synchronize in case of multiple CUDA streams io_binding.synchronize_inputs() self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - outputs = {} - for name, output in zip(self.output_names.keys(), io_binding._iobinding.get_outputs()): - outputs[name] = IOBindingHelper.to_pytorch(output) - - # converts output to namedtuple for pipelines post-processing - return SemanticSegmenterOutput(logits=outputs["logits"]) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - onnx_inputs = self._prepare_onnx_inputs(use_torch=use_torch, pixel_values=pixel_values, **kwargs) + model_inputs = {"pixel_values": pixel_values} - # run inference + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = onnx_outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) - - # converts output to namedtuple for pipelines post-processing - return SemanticSegmenterOutput(logits=logits) - - def _prepare_onnx_inputs(self, use_torch: bool, **kwargs): - onnx_inputs = {} - # converts pytorch inputs into numpy inputs for onnx - for input in self.inputs_names.keys(): - onnx_inputs[input] = kwargs.pop(input) - - if use_torch: - onnx_inputs[input] = onnx_inputs[input].cpu().detach().numpy() + logits = model_outputs["logits"] - return onnx_inputs + # converts output to namedtuple for pipelines post-processing + return SemanticSegmenterOutput(logits=logits) AUDIO_CLASSIFICATION_EXAMPLE = r""" @@ -1883,18 +1814,28 @@ def __init__( ) def forward( self, - input_values: Optional[torch.Tensor] = None, - attenton_mask: Optional[torch.Tensor] = None, + input_values: Optional[Union[torch.Tensor, np.ndarray]] = None, + attention_mask: Optional[Union[torch.Tensor, np.ndarray]] = None, + input_features: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): - if input_values is None: - # Whisper uses input_features and not input_values. 
- input_values = kwargs["input_features"] - use_torch = isinstance(input_values, torch.Tensor) + if self.input_name == "input_features": + assert input_features is not None, "input_features must be provided for this model" + model_input = input_features + elif self.input_name == "input_values": + assert input_values is not None, "input_values must be provided for this model" + model_input = input_values + else: + raise ValueError(f"Input {self.input_name} not supported for Audio Classification") + + use_torch = isinstance(model_input, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_values, ordered_input_names=self._ordered_input_names + model_input, + attention_mask, + ordered_input_names=self._ordered_input_names, ) # run inference with binding & synchronize in case of multiple CUDA streams @@ -1902,28 +1843,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return SequenceClassifierOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - self.input_name: input_values.cpu().detach().numpy(), - } - else: - onnx_inputs = { - self.input_name: input_values, - } + model_inputs = {self.input_name: model_input, "attention_mask": attention_mask} - # run inference - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return SequenceClassifierOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return SequenceClassifierOutput(logits=logits) CTC_EXAMPLE = r""" @@ -1971,11 +1902,12 @@ class ORTModelForCTC(ORTModel): ) def forward( self, - input_values: Optional[torch.Tensor] = None, + input_values: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): use_torch = isinstance(input_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if self.device.type == "cuda" and self.use_io_binding: input_size = input_values.shape[1] output_sizes = [] @@ -1990,9 +1922,7 @@ def _conv_output_size(input_size, kernel_size, stride): known_output_shapes = {"logits": [input_values.shape[0], output_sizes[-1], self.config.vocab_size]} io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_values, - ordered_input_names=self._ordered_input_names, - known_output_shapes=known_output_shapes, + input_values, ordered_input_names=self._ordered_input_names, known_output_shapes=known_output_shapes ) # run inference with binding & synchronize in case of multiple CUDA streams @@ -2000,28 +1930,18 @@ def _conv_output_size(input_size, kernel_size, stride): self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - outputs = {} - - return CausalLMOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - # converts 
pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_values": input_values.cpu().detach().numpy(), - } - else: - onnx_inputs = { - "input_values": input_values, - } + model_inputs = {"input_values": input_values} - # run inference - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) - # converts output to namedtuple for pipelines post-processing - return CausalLMOutput(logits=logits) + logits = model_outputs["logits"] + + # converts output to namedtuple for pipelines post-processing + return CausalLMOutput(logits=logits) AUDIO_XVECTOR_EXAMPLE = r""" @@ -2077,11 +1997,12 @@ class ORTModelForAudioXVector(ORTModel): ) def forward( self, - input_values: Optional[torch.Tensor] = None, + input_values: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): use_torch = isinstance(input_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_values, ordered_input_names=self._ordered_input_names @@ -2092,33 +2013,21 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return XVectorOutput( - logits=output_buffers["logits"].view(output_shapes["logits"]), - embeddings=output_buffers["embeddings"].view(output_shapes["embeddings"]), - ) + logits = output_buffers["logits"].view(output_shapes["logits"]) + embeddings = output_buffers["embeddings"].view(output_shapes["embeddings"]) + else: - if use_torch: - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_values": input_values.cpu().detach().numpy(), - } - else: - onnx_inputs = { - "input_values": input_values, - } + model_inputs = {"input_values": input_values} - # run inference - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = outputs[self.output_names["logits"]] - embeddings = outputs[self.output_names["embeddings"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) - embeddings = torch.from_numpy(embeddings).to(self.device) + logits = model_outputs["logits"] + embeddings = model_outputs["embeddings"] - # converts output to namedtuple for pipelines post-processing - return XVectorOutput(logits=logits, embeddings=embeddings) + # converts output to namedtuple for pipelines post-processing + return XVectorOutput(logits=logits, embeddings=embeddings) AUDIO_FRAME_CLASSIFICATION_EXAMPLE = r""" @@ -2166,7 +2075,7 @@ class ORTModelForAudioFrameClassification(ORTModel): ) def forward( self, - input_values: Optional[torch.Tensor] = None, + input_values: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): use_torch = isinstance(input_values, torch.Tensor) @@ -2175,24 +2084,16 @@ def forward( if self.device.type == "cuda" and self.use_io_binding: raise NotImplementedError() else: - if use_torch: - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_values": input_values.cpu().detach().numpy(), - } - else: - onnx_inputs = { 
- "input_values": input_values, - } + model_inputs = {"input_values": input_values} - # run inference - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) - # converts output to namedtuple for pipelines post-processing - return TokenClassifierOutput(logits=logits) + logits = model_outputs["logits"] + + # converts output to namedtuple for pipelines post-processing + return TokenClassifierOutput(logits=logits) CUSTOM_TASKS_EXAMPLE = r""" @@ -2241,57 +2142,27 @@ class ORTModelForCustomTasks(ORTModel): checkpoint="optimum/sbert-all-MiniLM-L6-with-pooler", ) ) - def forward(self, **kwargs): - use_torch = isinstance(next(iter(kwargs.values())), torch.Tensor) + def forward(self, **model_inputs: Union[torch.Tensor, np.ndarray]): + use_torch = isinstance(next(iter(model_inputs.values())), torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) if self.device.type == "cuda" and self.use_io_binding: - io_binding = IOBindingHelper.prepare_io_binding( - self, - **kwargs, - ordered_input_names=self._ordered_input_names, - ) + # TODO: should this be used in favor of `model.prepare_io_binding`? + io_binding = IOBindingHelper.prepare_io_binding(self, **model_inputs) # run inference with binding io_binding.synchronize_inputs() self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - outputs = {} + model_outputs = {} for name, output in zip(self.output_names.keys(), io_binding._iobinding.get_outputs()): - outputs[name] = IOBindingHelper.to_pytorch(output) + model_outputs[name] = IOBindingHelper.to_pytorch(output) - # converts output to namedtuple for pipelines post-processing - return ModelOutput(**outputs) else: - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = self._prepare_onnx_inputs(use_torch=use_torch, **kwargs) - - # run inference + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - outputs = self._prepare_onnx_outputs(onnx_outputs, use_torch=use_torch) - - # converts output to namedtuple for pipelines post-processing - return ModelOutput(outputs) - - def _prepare_onnx_inputs(self, use_torch: bool, **kwargs): - onnx_inputs = {} - # converts pytorch inputs into numpy inputs for onnx - for input in self.inputs_names.keys(): - onnx_inputs[input] = kwargs.pop(input) - - if use_torch: - onnx_inputs[input] = onnx_inputs[input].cpu().detach().numpy() - - return onnx_inputs - - def _prepare_onnx_outputs(self, onnx_outputs, use_torch: bool): - outputs = {} - # converts onnxruntime outputs into tensor for standard outputs - for output, idx in self.output_names.items(): - outputs[output] = onnx_outputs[idx] - - if use_torch: - outputs[output] = torch.from_numpy(outputs[output]).to(self.device) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - return outputs + # converts output to namedtuple for pipelines post-processing + return ModelOutput(**model_outputs) diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index a7c2b8bb050..41bd140d862 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -16,6 +16,7 @@ import importlib.util import itertools import os +import shutil import subprocess import sys import unittest @@ -181,3 
+182,16 @@ def grid_parameters(
         else:
             returned_list = [test_name] + list(params) if add_test_name is True else list(params)
             yield returned_list
+
+
+def remove_directory(dirpath):
+    """
+    Remove a directory and its content.
+    This is a cross-platform solution to remove a directory and its content that avoids the use of `shutil.rmtree` on Windows.
+    Reference: https://github.com/python/cpython/issues/107408
+    """
+    if os.path.exists(dirpath) and os.path.isdir(dirpath):
+        if os.name == "nt":
+            os.system(f"rmdir /S /Q {dirpath}")
+        else:
+            shutil.rmtree(dirpath)
diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index 7b2c8a66b9e..6c88fddb40f 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 import gc
 import os
-import shutil
 import subprocess
 import tempfile
 import time
@@ -109,7 +108,7 @@
     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
     logging,
 )
-from optimum.utils.testing_utils import grid_parameters, require_hf_token, require_ort_rocm
+from optimum.utils.testing_utils import grid_parameters, remove_directory, require_hf_token, require_ort_rocm
 
 
 logger = logging.get_logger()
@@ -184,9 +183,8 @@ def test_load_model_from_cache(self):
 
     def test_load_model_from_empty_cache(self):
         dirpath = os.path.join(default_cache_path, "models--" + self.TINY_ONNX_MODEL_ID.replace("/", "--"))
+        remove_directory(dirpath)
 
-        if os.path.exists(dirpath) and os.path.isdir(dirpath):
-            shutil.rmtree(dirpath)
         with self.assertRaises(Exception):
             _ = ORTModel.from_pretrained(self.TINY_ONNX_MODEL_ID, local_files_only=True)
 
@@ -202,9 +200,8 @@ def test_load_seq2seq_model_from_cache(self):
 
     def test_load_seq2seq_model_from_empty_cache(self):
         dirpath = os.path.join(default_cache_path, "models--" + self.TINY_ONNX_SEQ2SEQ_MODEL_ID.replace("/", "--"))
+        remove_directory(dirpath)
 
-        if os.path.exists(dirpath) and os.path.isdir(dirpath):
-            shutil.rmtree(dirpath)
         with self.assertRaises(Exception):
             _ = ORTModelForSeq2SeqLM.from_pretrained(self.TINY_ONNX_SEQ2SEQ_MODEL_ID, local_files_only=True)
 
@@ -225,9 +222,8 @@ def test_load_stable_diffusion_model_from_empty_cache(self):
         dirpath = os.path.join(
             default_cache_path, "models--" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID.replace("/", "--")
         )
+        remove_directory(dirpath)
 
-        if os.path.exists(dirpath) and os.path.isdir(dirpath):
-            shutil.rmtree(dirpath)
         with self.assertRaises(Exception):
             _ = ORTStableDiffusionPipeline.from_pretrained(
                 self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True
@@ -1008,6 +1004,7 @@ def test_save_load_ort_model_with_external_data(self):
             # verify loading from local folder works
             model = ORTModelForSequenceClassification.from_pretrained(tmpdirname, export=False)
             os.environ.pop("FORCE_ONNX_EXTERNAL_DATA")
+            remove_directory(tmpdirname)
 
     @parameterized.expand([(False,), (True,)])
     @pytest.mark.run_slow
@@ -1015,11 +1012,7 @@ def test_save_load_ort_model_with_external_data(self):
     def test_save_load_decoder_model_with_external_data(self, use_cache: bool):
         with tempfile.TemporaryDirectory() as tmpdirname:
             model = ORTModelForCausalLM.from_pretrained(
-                "gpt2-large",
-                use_cache=use_cache,
-                export=True,
-                use_merged=False,
-                use_io_binding=False,
+                "gpt2-large", use_cache=use_cache, export=True, use_merged=False, use_io_binding=False
             )
             model.save_pretrained(tmpdirname)
 
@@ -1033,6 +1026,7 @@ def test_save_load_decoder_model_with_external_data(self, use_cache: bool):
             model = ORTModelForCausalLM.from_pretrained(
                 tmpdirname, use_cache=use_cache,
export=False, use_io_binding=False ) + remove_directory(tmpdirname) @parameterized.expand([(False,), (True,)]) def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): @@ -1055,6 +1049,7 @@ def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): # verify loading from local folder works model = ORTModelForSeq2SeqLM.from_pretrained(tmpdirname, use_cache=use_cache, export=False) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") + remove_directory(tmpdirname) def test_save_load_stable_diffusion_model_with_external_data(self): with tempfile.TemporaryDirectory() as tmpdirname: @@ -1076,6 +1071,7 @@ def test_save_load_stable_diffusion_model_with_external_data(self): # verify loading from local folder works model = ORTStableDiffusionPipeline.from_pretrained(tmpdirname, export=False) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") + remove_directory(tmpdirname) @parameterized.expand([(False,), (True,)]) @unittest.skip("Skipping as this test consumes too much memory") @@ -2278,6 +2274,8 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): @parameterized.expand([(False,), (True,)]) @pytest.mark.run_in_series + # TODO: still gotta find out why this needs to be ran in series / why it fails in parallel + # my guess is that the model surgery is happening in parallel and that's causing the issue def test_inference_old_onnx_model(self, use_cache): tokenizer = get_preprocessor("gpt2") model = AutoModelForCausalLM.from_pretrained("gpt2") @@ -2290,9 +2288,9 @@ def test_inference_old_onnx_model(self, use_cache): tokens = tokenizer(text, return_tensors="pt") onnx_outputs = onnx_model.generate( - **tokens, num_beams=1, do_sample=False, min_new_tokens=10, max_new_tokens=10 + **tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30 ) - outputs = model.generate(**tokens, num_beams=1, do_sample=False, min_new_tokens=10, max_new_tokens=10) + outputs = model.generate(**tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30) onnx_text_outputs = tokenizer.decode(onnx_outputs[0], skip_special_tokens=True) text_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True) self.assertEqual(onnx_text_outputs, text_outputs) @@ -3605,13 +3603,20 @@ def _get_onnx_model_dir(self, model_id, model_arch, test_name): @pytest.mark.run_in_series def test_inference_old_onnx_model(self): - model = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small") + tokenizer = get_preprocessor("t5-small") + model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") + onnx_model = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small") - tokenizer = get_preprocessor("optimum/t5-small") text = "This is a sample output" tokens = tokenizer(text, return_tensors="pt") - model.generate(**tokens) + outputs = model.generate(**tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30) + onnx_outputs = onnx_model.generate( + **tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30 + ) + onnx_text_outputs = tokenizer.decode(onnx_outputs[0], skip_special_tokens=True) + text_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True) + self.assertEqual(onnx_text_outputs, text_outputs) def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: @@ -4760,6 +4765,9 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self.assertTrue("logits" in onnx_outputs) self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) + self.assertTrue( + 
torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-3) + ) if use_cache: self.assertEqual( @@ -4768,19 +4776,17 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self.assertEqual( len(onnx_outputs["past_key_values"][0]), len(transformers_outputs["past_key_values"][0]) ) - for i, _ in enumerate(onnx_outputs["past_key_values"]): - for j, ort_pkv in enumerate(onnx_outputs["past_key_values"][i]): - trfs_pkv = transformers_outputs["past_key_values"][i][j] + for i in range(len(onnx_outputs["past_key_values"])): + print(onnx_outputs["past_key_values"][i]) + for ort_pkv, trfs_pkv in zip( + onnx_outputs["past_key_values"][i], transformers_outputs["past_key_values"][i] + ): + ort_pkv = torch.Tensor(ort_pkv) self.assertTrue( torch.allclose(ort_pkv, trfs_pkv, atol=1e-3), f" Maxdiff: {torch.abs(ort_pkv - trfs_pkv).max()}", ) - # Compare tensor outputs - self.assertTrue( - torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-3) - ) - gc.collect() @parameterized.expand(grid_parameters(FULL_GRID))