From fdbd12f07232d71d59bf42b6c54b975e687d9827 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 23 Oct 2024 13:36:24 +0200 Subject: [PATCH 01/33] transformers 4.46 --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fb290274a3b..753f14145f5 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,8 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.29", + "transformers @ git+https://github.com/huggingface/transformers.git", + # "transformers[sentencepiece]>=4.29", "torch>=1.11", "packaging", "numpy", From 1583017d7a678ca2e3544da0911a3d40b8f1f772 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 23 Oct 2024 13:39:36 +0200 Subject: [PATCH 02/33] setup --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 753f14145f5..bb1667a09ff 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers<4.46.0", + "transformers<4.47.0", ], "onnxruntime-gpu": [ "onnx", @@ -64,10 +64,10 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. - "transformers<4.46.0", + "transformers<4.47.0", ], - "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.46.0"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.46.0"], + "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.47.0"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.47.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", From a4c3aa6001c609bd617d0c5d7497c111abc0d340 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 23 Oct 2024 13:46:49 +0200 Subject: [PATCH 03/33] uupdate setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bb1667a09ff..7d6e926461b 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ "h5py", "numpy<1.24.0", "datasets<=2.16", - "transformers[sentencepiece]>=4.26,<4.38", + # "transformers[sentencepiece]>=4.26,<4.38", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", From f0a4b7a201385fe48269640d4fff4482645da183 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 23 Oct 2024 14:13:00 +0200 Subject: [PATCH 04/33] fix t5 --- optimum/exporters/onnx/model_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index e77f649f69b..d2ee35dbade 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -480,7 +480,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class T5OnnxConfig(TextSeq2SeqOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 # T5 uses aten::triu that requires opset>=14 DUMMY_INPUT_GENERATOR_CLASSES = TextSeq2SeqOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES[:-1] + ( T5DummySeq2SeqPastKeyValuesGenerator, ) From ae08d8cc4c8098e6c28644f88cea4f45ca306475 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 23 Oct 2024 14:19:43 +0200 Subject: [PATCH 05/33] update python (3.8 eol) --- .github/workflows/check_code_quality.yml | 2 +- .github/workflows/test_benckmark.yml | 30 ++++++------- .github/workflows/test_cli.yml | 2 +- .github/workflows/test_export_onnx.yml | 44 +++++++++---------- .github/workflows/test_export_onnx_cli.yml | 30 ++++++------- .../workflows/test_export_onnx_cli_timm.yml | 26 +++++------ .github/workflows/test_export_onnx_timm.yml | 27 ++++++------ .github/workflows/test_exporters_common.yml | 2 +- .github/workflows/test_exporters_slow.yml | 2 +- .github/workflows/test_fx.yml | 2 +- .github/workflows/test_offline.yml | 2 +- .github/workflows/test_onnx.yml | 2 +- .github/workflows/test_onnxruntime.yml | 2 +- .github/workflows/test_onnxruntime_slow.yml | 2 +- .github/workflows/test_optimum_common.yml | 39 ++++++++-------- .github/workflows/test_utils.yml | 2 +- 16 files changed, 107 insertions(+), 109 deletions(-) diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml index c429b706bff..861684cfa4d 100644 --- a/.github/workflows/check_code_quality.yml +++ b/.github/workflows/check_code_quality.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml index 7f7f2ace329..e859e845d64 100644 --- a/.github/workflows/test_benckmark.yml +++ b/.github/workflows/test_benckmark.yml @@ -4,9 +4,9 @@ name: Benchmark suite / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,20 +17,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install wheel - pip install .[tests,onnxruntime,benchmark] - - name: Test with unittest - run: | - python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install wheel + pip install .[tests,onnxruntime,benchmark] + - name: Test with unittest + run: | + python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml index ecb19d23aa3..be000caf6a3 100644 --- a/.github/workflows/test_cli.yml +++ b/.github/workflows/test_cli.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_export_onnx.yml b/.github/workflows/test_export_onnx.yml index 56ef674cb41..0cd19a1724c 100644 --- a/.github/workflows/test_export_onnx.yml +++ b/.github/workflows/test_export_onnx.yml @@ -2,9 +2,9 @@ name: Exporters ONNX / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -15,27 +15,27 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - pytest exporters/onnx/test_onnx_*.py -s -n auto -m "not tensorflow_test and not timm_test" --durations=0 - - name: Install dependencies for tensorflow export - run: | - pip install .[tests,exporters-tf] - - name: Test with unittest - working-directory: tests - run: | - pytest exporters/onnx/test_onnx_*.py -n auto -m "tensorflow_test" -s --durations=0 + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for pytorch export + run: | + pip install .[tests,exporters] + - name: Test with unittest + working-directory: tests + run: | + pytest exporters/onnx/test_onnx_*.py -s -n auto -m "not tensorflow_test and not timm_test" --durations=0 + - name: Install dependencies for tensorflow export + run: | + pip install .[tests,exporters-tf] + - name: Test with unittest + working-directory: tests + run: | + pytest exporters/onnx/test_onnx_*.py -n auto -m "tensorflow_test" -s --durations=0 diff --git a/.github/workflows/test_export_onnx_cli.yml b/.github/workflows/test_export_onnx_cli.yml index 8fa4ebb045f..618a140c147 100644 --- a/.github/workflows/test_export_onnx_cli.yml +++ b/.github/workflows/test_export_onnx_cli.yml @@ -2,9 +2,9 @@ name: Exporters ONNX CLI / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -15,20 +15,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0 + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for pytorch export + run: | + pip install .[tests,exporters] + - name: Test with unittest + working-directory: tests + run: | + pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0 diff --git a/.github/workflows/test_export_onnx_cli_timm.yml b/.github/workflows/test_export_onnx_cli_timm.yml index 76a535fcebd..b92d5551ba1 100644 --- a/.github/workflows/test_export_onnx_cli_timm.yml +++ b/.github/workflows/test_export_onnx_cli_timm.yml @@ -14,20 +14,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - RUN_SLOW=1 pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -k "timm" -s --durations=0 + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for pytorch export + run: | + pip install .[tests,exporters] + - name: Test with unittest + working-directory: tests + run: | + RUN_SLOW=1 pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -k "timm" -s --durations=0 diff --git a/.github/workflows/test_export_onnx_timm.yml b/.github/workflows/test_export_onnx_timm.yml index 339e3e93dec..c16d20fbc18 100644 --- a/.github/workflows/test_export_onnx_timm.yml +++ b/.github/workflows/test_export_onnx_timm.yml @@ -14,21 +14,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - RUN_SLOW=1 pytest exporters/onnx/ -s -n auto -k "timm" --durations=0 - + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for pytorch export + run: | + pip install .[tests,exporters] + - name: Test with unittest + working-directory: tests + run: | + RUN_SLOW=1 pytest exporters/onnx/ -s -n auto -k "timm" --durations=0 diff --git a/.github/workflows/test_exporters_common.yml b/.github/workflows/test_exporters_common.yml index 8e8c3360c1f..11f6038afe4 100644 --- a/.github/workflows/test_exporters_common.yml +++ b/.github/workflows/test_exporters_common.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_exporters_slow.yml b/.github/workflows/test_exporters_slow.yml index b22fdd7fd2a..453389d63fa 100644 --- a/.github/workflows/test_exporters_slow.yml +++ b/.github/workflows/test_exporters_slow.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml index f0366cf0d1e..a4e6dd3cd29 100644 --- a/.github/workflows/test_fx.yml +++ b/.github/workflows/test_fx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_offline.yml b/.github/workflows/test_offline.yml index 90b0108e512..20911fe6db8 100644 --- a/.github/workflows/test_offline.yml +++ b/.github/workflows/test_offline.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 22a11720798..dd1f3bee63d 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index a72bedb1ab7..7bbe00bd157 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, windows-2019, macos-13] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml index 20371f79150..c5679e5b307 100644 --- a/.github/workflows/test_onnxruntime_slow.yml +++ b/.github/workflows/test_onnxruntime_slow.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_optimum_common.yml b/.github/workflows/test_optimum_common.yml index ded149c9b69..5ad42807a5f 100644 --- a/.github/workflows/test_optimum_common.yml +++ b/.github/workflows/test_optimum_common.yml @@ -4,9 +4,9 @@ name: Optimum common / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,25 +17,24 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, windows-2019, macos-13] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install .[tests] - ls -l optimum/ - - name: Test with unittest - shell: bash - run: | - # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel. - export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }} - pytest tests/test_*.py - + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[tests] + ls -l optimum/ + - name: Test with unittest + shell: bash + run: | + # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel. + export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }} + pytest tests/test_*.py diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml index 1ef33ced086..b5f2e27fc6a 100644 --- a/.github/workflows/test_utils.yml +++ b/.github/workflows/test_utils.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-20.04, macos-13] - python-version: [3.8, 3.9] + python-version: [3.9] runs-on: ${{ matrix.os }} steps: From 0096598842d9086cf39221607fdb57880f7a5390 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 23 Oct 2024 14:43:56 +0200 Subject: [PATCH 06/33] fix onnx test --- optimum/utils/__init__.py | 1 + optimum/utils/import_utils.py | 16 ++++++++++++++++ setup.py | 1 - tests/onnx/test_onnx_export_custom_module.py | 17 +++++++++++++++-- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 5d5044e63e1..db7d1f6975d 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -29,6 +29,7 @@ TRANSFORMERS_MINIMUM_VERSION, check_if_diffusers_greater, check_if_pytorch_greater, + check_if_torch_greater, check_if_transformers_greater, is_accelerate_available, is_auto_gptq_available, diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 4a57fda79ce..7dace56d5dd 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -193,6 +193,22 @@ def check_if_diffusers_greater(target_version: str) -> bool: return version.parse(_diffusers_version) >= version.parse(target_version) +def check_if_torch_greater(target_version: str) -> bool: + """ + Checks whether the current install of torch is greater than or equal to the target version. + + Args: + target_version (str): version used as the reference for comparison. + + Returns: + bool: whether the check is True or not. + """ + if not is_torch_available(): + return False + + return version.parse(torch_version) >= version.parse(target_version) + + @contextmanager def require_numpy_strictly_lower(package_version: str, message: str): if not version.parse(np.__version__) < version.parse(package_version): diff --git a/setup.py b/setup.py index 7d6e926461b..40ed309aca0 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,6 @@ "diffusers>=0.17.0", "torchaudio", "einops", - "invisible-watermark", "timm", "scikit-learn", "rjieba", diff --git a/tests/onnx/test_onnx_export_custom_module.py b/tests/onnx/test_onnx_export_custom_module.py index a144d5cd840..4398c14f01d 100644 --- a/tests/onnx/test_onnx_export_custom_module.py +++ b/tests/onnx/test_onnx_export_custom_module.py @@ -24,6 +24,8 @@ import torch from transformers.models.deberta import modeling_deberta + from optimum.utils import check_if_torch_greater + class StableDropoutTestCase(TestCase): """Tests export of StableDropout module.""" @@ -50,8 +52,8 @@ def test_training(self): training=training, ) - # Expected to fail with opset_version < 12 - with self.assertRaises(Exception): + if check_if_torch_greater("2.5"): + # Expected to pass with opset_version < 12 on torch >= 2.5 torch.onnx.export( sd, input, @@ -60,3 +62,14 @@ def test_training(self): do_constant_folding=do_constant_folding, training=training, ) + else: + # Expected to fail with opset_version < 12 on torch < 2.5 + with self.assertRaises(Exception): + torch.onnx.export( + sd, + input, + devnull, + opset_version=11, + do_constant_folding=do_constant_folding, + training=training, + ) From 9435122e04569d165e2c858c4c39a86fc28fec82 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 23 Oct 2024 14:51:47 +0200 Subject: [PATCH 07/33] fixed deberta, onnxruntime tests in series passing --- optimum/exporters/onnx/model_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d2ee35dbade..5d7c29b4f45 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -155,7 +155,7 @@ class SplinterOnnxConfig(BertOnnxConfig): class DistilBertOnnxConfig(BertOnnxConfig): - DEFAULT_ONNX_OPSET = 11 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for transformers>=4.46.0 @property def inputs(self) -> Dict[str, Dict[int, str]]: From d25cd97fa835be020de29fa69ab5a0638e9862ed Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 23 Oct 2024 16:04:50 +0200 Subject: [PATCH 08/33] fix bt --- optimum/bettertransformer/transformation.py | 36 ++++++++++++++----- optimum/utils/import_utils.py | 2 +- setup.py | 1 + tests/bettertransformer/test_audio.py | 20 ++++++----- tests/bettertransformer/test_common.py | 12 ++++--- tests/bettertransformer/test_decoder.py | 6 ++-- tests/bettertransformer/test_encoder.py | 4 ++- .../bettertransformer/test_encoder_decoder.py | 3 +- tests/bettertransformer/test_gpu.py | 4 ++- tests/bettertransformer/testing_utils.py | 18 +++++----- 10 files changed, 68 insertions(+), 38 deletions(-) diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py index a101757b6fa..b138862752e 100644 --- a/optimum/bettertransformer/transformation.py +++ b/optimum/bettertransformer/transformation.py @@ -20,7 +20,13 @@ import torch from packaging.version import parse -from ..utils import check_if_pytorch_greater, is_accelerate_available, recurse_getattr, recurse_setattr +from ..utils import ( + check_if_pytorch_greater, + check_if_torch_greater, + is_accelerate_available, + recurse_getattr, + recurse_setattr, +) from .models import BetterTransformerManager @@ -213,15 +219,18 @@ def transform( hf_config = model.config if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]: raise ValueError( - f"Transformers now supports natively BetterTransformer optimizations (torch.nn.functional.scaled_dot_product_attention) for the model type {hf_config.model_type}. As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. Please upgrade to transformers>=4.36 and torch>=2.1.1 to use it. Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention." + f"Transformers now supports natively BetterTransformer optimizations (torch.nn.functional.scaled_dot_product_attention) for the model type {hf_config.model_type}. " + "As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. " + "Please upgrade to transformers>=4.36 and torch>=2.1.1 to use it. " + "Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention." ) - # Check if we have to load the model using `accelerate` - if hasattr(model, "hf_device_map"): - load_accelerate = True - hf_device_map = model.hf_device_map - else: - load_accelerate = False + if hasattr(hf_config, "_attn_implementation") and hf_config._attn_implementation == "sdpa": + raise ValueError( + "This model already uses BetterTransformer optimizations from Transformers (torch.nn.functional.scaled_dot_product_attention). " + "As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. " + "Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention." + ) if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True: raise Exception( @@ -241,11 +250,20 @@ def transform( f" Currently supported models are: {BetterTransformerManager.MODEL_MAPPING.keys()}." ) - if parse(torch.__version__) <= parse("1.14"): + if not check_if_torch_greater("2.0"): raise ValueError( f"BetterTransformer requires torch>=2.0 but {torch.__version__} is installed. Please upgrade PyTorch." ) + hf_config = model.config + + # Check if we have to load the model using `accelerate` + if hasattr(model, "hf_device_map"): + load_accelerate = True + hf_device_map = model.hf_device_map + else: + load_accelerate = False + if load_accelerate: # Remove the hooks from the original model to avoid weights being on `meta` device. remove_hook_from_module(model, recurse=True) diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 7dace56d5dd..35a6294ab52 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -206,7 +206,7 @@ def check_if_torch_greater(target_version: str) -> bool: if not is_torch_available(): return False - return version.parse(torch_version) >= version.parse(target_version) + return torch_version >= version.parse(target_version) @contextmanager diff --git a/setup.py b/setup.py index 40ed309aca0..243bb46699e 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ "einops", "timm", "scikit-learn", + "sentencepiece", "rjieba", ] diff --git a/tests/bettertransformer/test_audio.py b/tests/bettertransformer/test_audio.py index be01a92d447..caca91e27ca 100644 --- a/tests/bettertransformer/test_audio.py +++ b/tests/bettertransformer/test_audio.py @@ -35,7 +35,7 @@ class TestsWhisper(unittest.TestCase): def test_error_message(self): - model = AutoModel.from_pretrained("openai/whisper-tiny") + model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager") with self.assertRaises(ValueError) as cm: model = BetterTransformer.transform(model) @@ -82,15 +82,19 @@ def _test_fp16_inference( set_seed(0) if not use_to_operator: - hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0) + hf_random_model = automodel_class.from_pretrained( + model_id, torch_dtype=torch.float16, attn_implementation="eager" + ).to(0) converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=False) - hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0) + hf_random_model = automodel_class.from_pretrained( + model_id, torch_dtype=torch.float16, attn_implementation="eager" + ).to(0) else: - hf_random_model = automodel_class.from_pretrained(model_id).to(0) + hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0) converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=False) - hf_random_model = automodel_class.from_pretrained(model_id).to(0) + hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0) hf_random_model = hf_random_model.to(torch.float16) converted_model = converted_model.to(torch.float16) @@ -147,7 +151,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int): model_id = MODELS_DICT[model_type] processor = AutoProcessor.from_pretrained(model_id) - model = AutoModel.from_pretrained(model_id) + model = AutoModel.from_pretrained(model_id, attn_implementation="eager") text = ["This is me and me"] if batch_size > 1: @@ -217,14 +221,14 @@ def test_logits(self, model_type: str): inputs = self.prepare_inputs_for_class(model_id, model_type) torch.manual_seed(0) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() random_config = hf_random_model.config torch.manual_seed(0) converted_model = BetterTransformer.transform(hf_random_model) torch.manual_seed(0) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() random_config = hf_random_model.config self.assertFalse( diff --git a/tests/bettertransformer/test_common.py b/tests/bettertransformer/test_common.py index 35b89d2ed2e..b8bc0a3b3d9 100644 --- a/tests/bettertransformer/test_common.py +++ b/tests/bettertransformer/test_common.py @@ -28,7 +28,7 @@ class BetterTransformerIntegrationTests(unittest.TestCase): def test_raise_error_on_double_transform_call(self): - model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BertModel") + model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="eager") with self.assertRaises(Exception) as cm: bt_model = BetterTransformer.transform(model) @@ -59,7 +59,7 @@ def test_raise_on_save(self, model_type: str): ) for model_id in model_ids: with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname: - hf_model = AutoModel.from_pretrained(model_id).eval() + hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() bt_model = BetterTransformer.transform(hf_model, keep_original_model=False) bt_model.save_pretrained(tmpdirname) @@ -73,7 +73,7 @@ def test_conversion(self, model_type: str): MODELS_DICT[model_type] if isinstance(MODELS_DICT[model_type], tuple) else (MODELS_DICT[model_type],) ) for model_id in model_ids: - hf_random_model = AutoModel.from_pretrained(model_id) + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager") converted_model = BetterTransformer.transform(hf_random_model) self.assertTrue( @@ -99,7 +99,7 @@ def test_raise_save_pretrained_error(self, test_name: str, model_type: str, keep ) for model_id in model_ids: # get hf and bt model - hf_model = AutoModel.from_pretrained(model_id) + hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager") # get bt model and invert it bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model) @@ -145,9 +145,11 @@ def test_raise_activation_fun(self, model_type: str): )() # random config class for the model to test hf_random_config.hidden_act = "silu" - hf_random_model = AutoModel.from_config(hf_random_config).eval() + hf_random_model = AutoModel.from_config(hf_random_config, attn_implementation="eager").eval() + with self.assertRaises(ValueError) as cm: _ = BetterTransformer.transform(hf_random_model, keep_original_model=True) + self.assertTrue("Activation function" in str(cm.exception)) def test_dict_class_consistency(self): diff --git a/tests/bettertransformer/test_decoder.py b/tests/bettertransformer/test_decoder.py index bab8f376fcc..f5958ceb1d2 100644 --- a/tests/bettertransformer/test_decoder.py +++ b/tests/bettertransformer/test_decoder.py @@ -131,7 +131,7 @@ def test_logits_with_cache(self, test_name: str, model_type: str, batch_size: in model_id = MODELS_DICT[model_type] - model = AutoModelForCausalLM.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager") normalized_config = NormalizedConfigManager.get_normalized_config_class(model.config.model_type)(model.config) @@ -167,7 +167,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd model_id = MODELS_DICT[model_type] tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForCausalLM.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager") if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: if tokenizer.eos_token != "": @@ -224,7 +224,7 @@ def test_invert_model_logits(self, test_name: str, model_type: str, keep_origina @require_torch_gpu @require_accelerate def test_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_memory=None): - hf_model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto", max_memory=max_memory).eval() + hf_model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto", max_memory=max_memory, attn_implementation="eager").eval() bt_model = BetterTransformer.transform( hf_model, keep_original_model=keep_original_model, max_memory=max_memory ) diff --git a/tests/bettertransformer/test_encoder.py b/tests/bettertransformer/test_encoder.py index 74aacaed58c..7dd42c43b05 100644 --- a/tests/bettertransformer/test_encoder.py +++ b/tests/bettertransformer/test_encoder.py @@ -181,7 +181,9 @@ def check_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_m If this works for roberta, it should work for all other models too. """ - hf_model = AutoModel.from_pretrained("xlm-roberta-base", device_map="auto", max_memory=max_memory).eval() + hf_model = AutoModel.from_pretrained( + "xlm-roberta-base", device_map="auto", max_memory=max_memory, attn_implementation="eager" + ).eval() bt_model = BetterTransformer.transform( hf_model, keep_original_model=keep_original_model, max_memory=max_memory ) diff --git a/tests/bettertransformer/test_encoder_decoder.py b/tests/bettertransformer/test_encoder_decoder.py index 8d05923522a..b64f66fa1a3 100644 --- a/tests/bettertransformer/test_encoder_decoder.py +++ b/tests/bettertransformer/test_encoder_decoder.py @@ -45,7 +45,6 @@ class BetterTransformersEncoderDecoderTest(BetterTransformersTestMixin, unittest "mbart", "pegasus", "prophetnet", - "t5", ] FULL_GRID = { @@ -153,7 +152,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd model_id = MODELS_DICT[model_type] tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + model = AutoModelForSeq2SeqLM.from_pretrained(model_id, attn_implementation="eager") if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token diff --git a/tests/bettertransformer/test_gpu.py b/tests/bettertransformer/test_gpu.py index b992b90d3c8..ada38e408fa 100644 --- a/tests/bettertransformer/test_gpu.py +++ b/tests/bettertransformer/test_gpu.py @@ -26,7 +26,9 @@ def timing_cuda(model, num_batches, input_ids, masks, decoder_input_ids): def benchmark(model_name: str, num_batches: int, batch_size: int, max_seqlen: int, is_half: bool): - hf_model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16 if is_half else None).eval() + hf_model = AutoModel.from_pretrained( + model_name, torch_dtype=torch.float16 if is_half else None, attn_implementation="eager" + ).eval() hf_model = hf_model.to("cuda:0") bt_model = BetterTransformer.transform(hf_model, keep_original_model=True) diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index 098882180aa..f79cbb34512 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -136,10 +136,12 @@ def _test_fp16_inference( torch.manual_seed(0) if not use_to_operator: - hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0) + hf_random_model = automodel_class.from_pretrained( + model_id, torch_dtype=torch.float16, attn_implementation="eager" + ).to(0) converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True) else: - hf_random_model = automodel_class.from_pretrained(model_id).to(0) + hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0) converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True) hf_random_model = hf_random_model.to(torch.float16) converted_model = converted_model.to(torch.float16) @@ -169,7 +171,7 @@ def _test_fp16_inference( def _test_logits_backward(self, model_id: str, model_type: str, **preprocessor_kwargs): inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **preprocessor_kwargs) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() random_config = hf_random_model.config # I could not obtain reproducible results with `torch.manual_seed` nor with @@ -309,7 +311,7 @@ def _test_train_decoder(self, model_id: str, model_type: str, **kwargs): """ inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **kwargs) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() bt_model = BetterTransformer.transform(hf_random_model, keep_original_model=True) bt_model.train() @@ -328,7 +330,7 @@ def _test_invert_modules(self, model_id, keep_original_model=False): r""" Test that the inverse converted model and hf model have the same modules """ - hf_model = AutoModel.from_pretrained(model_id) + hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager") hf_modules = list(hf_model.modules()) bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model) @@ -349,7 +351,7 @@ def _test_invert_modules(self, model_id, keep_original_model=False): def _test_save_load_invertible(self, model_id, keep_original_model=True): with tempfile.TemporaryDirectory() as tmpdirname: - hf_model = AutoModel.from_pretrained(model_id).eval() + hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() hf_model_state_dict = copy.deepcopy(hf_model.state_dict()) bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model) @@ -362,7 +364,7 @@ def _test_save_load_invertible(self, model_id, keep_original_model=True): # saving a normal transformers bark model fails because of shared tensors bt_model.save_pretrained(tmpdirname, safe_serialization=hf_model.config.model_type != "bark") - bt_model_from_load = AutoModel.from_pretrained(tmpdirname) + bt_model_from_load = AutoModel.from_pretrained(tmpdirname, attn_implementation="eager") self.assertEqual( set(bt_model.state_dict().keys()), @@ -397,7 +399,7 @@ def _test_invert_model_logits( """ inputs = self.prepare_inputs_for_class(model_id, model_type=model_type, **preprocessor_kwargs) - hf_model = AutoModel.from_pretrained(model_id) + hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager") hf_model = hf_model.eval() with torch.inference_mode(): From 29a5dbc41f5a0b56d43b7b45c6557817e7a58c47 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 23 Oct 2024 17:20:59 +0200 Subject: [PATCH 09/33] fixed t5_forward for real, because it's also used by blip-2 as well --- optimum/bettertransformer/models/attention.py | 326 ++++++++++++------ .../models/decoder_models.py | 4 +- setup.py | 2 +- tests/bettertransformer/test_decoder.py | 4 +- .../bettertransformer/test_encoder_decoder.py | 1 + 5 files changed, 223 insertions(+), 114 deletions(-) diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 22b8faf1c21..c8c91a04e4e 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -387,137 +387,243 @@ def opt_forward( # Adapted from transformers.models.t5.modeling_t5.T5Attention.forward -def t5_forward( - self, - hidden_states, - mask=None, - key_value_states=None, - position_bias=None, - past_key_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, - output_attentions=False, - **kwargs, -): - raise_on_head_mask(layer_head_mask) +if check_if_transformers_greater("4.45.99"): - if output_attentions is True: - raise ValueError("output_attentions=True can not be supported with BetterTransformer.") - if len(self.pruned_heads) > 0: - raise ValueError(f"Setting `pruned_heads` is unsupported with BetterTransformer, found {self.pruned_heads}.") - batch_size, seq_length = hidden_states.shape[:2] - - real_seq_length = seq_length - - if past_key_value is not None: - assert ( - len(past_key_value) == 2 - ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) + def t5_forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + cache_position=None, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder) + batch_size, seq_length = hidden_states.shape[:2] + + # if key_value_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = key_value_states is not None + + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_value.cross_attention_cache + else: + curr_past_key_value = past_key_value.self_attention_cache + + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] + else: + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + if past_key_value is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True + + if position_bias is None: + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, seq_length, key_length), device=query_states.device, dtype=query_states.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias( + real_seq_length, key_length, device=query_states.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] + + if mask is not None: + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=position_bias_masked, + dropout_p=self.dropout if self.training else 0.0, + is_causal=False, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) + attn_output = self.o(attn_output) + + outputs = (attn_output, past_key_value, position_bias) + + return outputs + +else: + + def t5_forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + **kwargs, + ): + raise_on_head_mask(layer_head_mask) + + if output_attentions is True: + raise ValueError("output_attentions=True can not be supported with BetterTransformer.") + if len(self.pruned_heads) > 0: + raise ValueError( + f"Setting `pruned_heads` is unsupported with BetterTransformer, found {self.pruned_heads}." + ) + + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key_value is not None: + assert ( + len(past_key_value) == 2 + ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" + real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, key_value_states, past_key_value): + """projects hidden states correctly to key/query states""" if key_value_states is None: # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: # cross-attn # (batch_size, n_heads, seq_length, dim_per_head) hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - # get key/value states - key_states = project( - hidden_states, - self.k, - key_value_states, - past_key_value[0] if past_key_value is not None else None, - ) - value_states = project( - hidden_states, - self.v, - key_value_states, - past_key_value[1] if past_key_value is not None else None, - ) + if past_key_value is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + elif past_key_value.shape[2] != key_value_states.shape[1]: + # checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, + self.k, + key_value_states, + past_key_value[0] if past_key_value is not None else None, + ) + value_states = project( + hidden_states, + self.v, + key_value_states, + past_key_value[1] if past_key_value is not None else None, + ) - dropout_p = self.dropout if self.training else 0.0 - query_states = self.scale * query_states - if position_bias is None and not self.has_relative_attention_bias: - if mask is None: - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, key_states, value_states, attn_mask=None, dropout_p=dropout_p, is_causal=False - ) - elif mask is not None: + dropout_p = self.dropout if self.training else 0.0 + query_states = self.scale * query_states + if position_bias is None and not self.has_relative_attention_bias: attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=mask, dropout_p=dropout_p, is_causal=False ) - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), - device=value_states.device, - dtype=value_states.dtype, - ) - if self.gradient_checkpointing and self.training: - position_bias.requires_grad = True + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), + device=value_states.device, + dtype=value_states.dtype, + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length, device=value_states.device) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + if self.has_relative_attention_bias: + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=position_bias, + dropout_p=dropout_p, + is_causal=False, + ) else: - position_bias = self.compute_bias(real_seq_length, key_length, device=value_states.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - - if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - - if self.has_relative_attention_bias: attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=position_bias, dropout_p=dropout_p, is_causal=False ) - else: - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, key_states, value_states, attn_mask=position_bias, dropout_p=dropout_p, is_causal=False - ) - attn_output = unshape(attn_output) # (batch_size, seq_length, dim) - attn_output = self.o(attn_output) + attn_output = unshape(attn_output) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) - return outputs + return outputs # Adapted from transformers.models.bart.modeling_bart.BartAttention.forward diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index 52d28d076d3..e8045e695c1 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -327,9 +327,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): setattr(self, "relative_attention_bias", layer.relative_attention_bias) self.original_layers_mapping["relative_attention_bias"] = "relative_attention_bias" - self.module_mapping = None - + self.layer_idx = getattr(layer, "layer_idx", None) self.is_decoder = layer.is_decoder + self.module_mapping = None def forward(self, *args, **kwargs): return t5_forward(self, *args, **kwargs) diff --git a/setup.py b/setup.py index 243bb46699e..62dd6ee8fa5 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ "h5py", "numpy<1.24.0", "datasets<=2.16", - # "transformers[sentencepiece]>=4.26,<4.38", + "transformers>=4.26,<4.38", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/bettertransformer/test_decoder.py b/tests/bettertransformer/test_decoder.py index f5958ceb1d2..e2bc6ddc2fb 100644 --- a/tests/bettertransformer/test_decoder.py +++ b/tests/bettertransformer/test_decoder.py @@ -224,7 +224,9 @@ def test_invert_model_logits(self, test_name: str, model_type: str, keep_origina @require_torch_gpu @require_accelerate def test_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_memory=None): - hf_model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto", max_memory=max_memory, attn_implementation="eager").eval() + hf_model = AutoModelForCausalLM.from_pretrained( + "gpt2", device_map="auto", max_memory=max_memory, attn_implementation="eager" + ).eval() bt_model = BetterTransformer.transform( hf_model, keep_original_model=keep_original_model, max_memory=max_memory ) diff --git a/tests/bettertransformer/test_encoder_decoder.py b/tests/bettertransformer/test_encoder_decoder.py index b64f66fa1a3..5ce4d62b12c 100644 --- a/tests/bettertransformer/test_encoder_decoder.py +++ b/tests/bettertransformer/test_encoder_decoder.py @@ -45,6 +45,7 @@ class BetterTransformersEncoderDecoderTest(BetterTransformersTestMixin, unittest "mbart", "pegasus", "prophetnet", + "t5", ] FULL_GRID = { From c860ba964f5a633dda88675001bbc4838e37132c Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 24 Oct 2024 10:44:03 +0200 Subject: [PATCH 10/33] fix Phi3 --- optimum/exporters/onnx/model_configs.py | 10 ++- optimum/exporters/onnx/model_patcher.py | 89 +++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 5d7c29b4f45..ed270660bbd 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -76,6 +76,7 @@ FalconModelPatcher, MistralModelPatcher, MusicgenModelPatcher, + Phi3ModelPatcher, SAMModelPatcher, SentenceTransformersCLIPPatcher, SentenceTransformersTransformerPatcher, @@ -304,6 +305,11 @@ class Phi3OnnxConfig(PhiOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfigWithGQA MIN_TRANSFORMERS_VERSION = version.parse("4.41.0") + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return Phi3ModelPatcher(self, model, model_kwargs=model_kwargs) + class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35 @@ -2156,8 +2162,8 @@ class Pix2StructOnnxConfig(OnnxSeq2SeqConfigWithPast): DummySeq2SeqPastKeyValuesGenerator, DummyPix2StructInputGenerator, ) - # Min operator needs to support int64, which is the case for opset>=12 - DEFAULT_ONNX_OPSET = 12 + + DEFAULT_ONNX_OPSET = 14 # use 'aten::triu' now which is opset 14 @property def inputs(self): diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 34ed5fcae46..cad4a418808 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -1155,3 +1155,92 @@ def __exit__(self, exc_type, exc_value, traceback): from transformers.models.clip.modeling_clip import CLIPSdpaAttention CLIPSdpaAttention.forward = self.original_sdpa_forward + + +# Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Phi3 +def _prepare_4d_causal_attention_mask_with_cache_position_patched( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + config: Any, + past_key_values: Any, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + config (`Phi3Config`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) + causal_mask *= diagonal_attend_mask + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + return causal_mask + + +from transformers import Phi3ForCausalLM + + +class Phi3ModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Phi3ForCausalLM, + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + if _transformers_version >= version.parse("4.46.0"): + if hasattr(self._model, "model"): + self._model.model._prepare_4d_causal_attention_mask_with_cache_position = ( + _prepare_4d_causal_attention_mask_with_cache_position_patched + ) + else: + self._model._prepare_4d_causal_attention_mask_with_cache_position = ( + _prepare_4d_causal_attention_mask_with_cache_position_patched + ) From 619161716ef328e7b4e18e50f7500b1e7afae286 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 24 Oct 2024 10:45:10 +0200 Subject: [PATCH 11/33] fix opt --- optimum/exporters/onnx/model_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index ed270660bbd..e1fa378ad2d 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -269,7 +269,7 @@ class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): class OPTOnnxConfig(TextDecoderOnnxConfig): # OPT does not require position_ids input. - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig From 79ed33589e78f871e1b8680fa75974ee63a385fa Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 24 Oct 2024 10:48:47 +0200 Subject: [PATCH 12/33] vision encoder decoder --- optimum/exporters/onnx/model_configs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index e1fa378ad2d..7bea661cd9b 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -2033,6 +2033,7 @@ class TrOCROnnxConfig(TextSeq2SeqOnnxConfig): class VisionEncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig ATOL_FOR_VALIDATION = 1e-3 + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator) From 72ca53c0ce1e2a934a14f8c1b8a154e12296a5d7 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 24 Oct 2024 10:51:40 +0200 Subject: [PATCH 13/33] fix setup --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 62dd6ee8fa5..8c401df4ce3 100644 --- a/setup.py +++ b/setup.py @@ -15,8 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers @ git+https://github.com/huggingface/transformers.git", - # "transformers[sentencepiece]>=4.29", + "transformers>=4.29", "torch>=1.11", "packaging", "numpy", From d1236ddde64576024996abfbe07073519d2d99ef Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 24 Oct 2024 10:58:12 +0200 Subject: [PATCH 14/33] style --- optimum/exporters/onnx/model_patcher.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index cad4a418808..9b8119dc2f4 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -1223,14 +1223,11 @@ def _prepare_4d_causal_attention_mask_with_cache_position_patched( return causal_mask -from transformers import Phi3ForCausalLM - - class Phi3ModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", - model: Phi3ForCausalLM, + model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None, ): super().__init__(config, model, model_kwargs) From f98a605fef1a575a34786350c6de32d6969cc73b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 24 Oct 2024 12:09:25 +0200 Subject: [PATCH 15/33] fix encoder decoder --- optimum/exporters/onnx/model_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 7bea661cd9b..714e3c35755 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -2317,3 +2317,5 @@ def overwrite_shape_and_generate_input( class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig + + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. From d1f48702bd0d2036087509ae848288dcca928e32 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 24 Oct 2024 14:49:46 +0200 Subject: [PATCH 16/33] fixed transformers branch --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8c401df4ce3..058cf6921f3 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers>=4.29", + "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", "torch>=1.11", "packaging", "numpy", From 970235a83801bb901e5cea96cfc28c05878f5b61 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 24 Oct 2024 15:02:40 +0200 Subject: [PATCH 17/33] branch --- setup.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 058cf6921f3..9dad6f71427 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", + "transformers>=4.29,<4.47", "torch>=1.11", "packaging", "numpy", @@ -54,7 +54,8 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers<4.47.0", + # "transformers<4.47.0", + "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", ], "onnxruntime-gpu": [ "onnx", @@ -63,10 +64,23 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. - "transformers<4.47.0", + # "transformers<4.47.0", + "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", + ], + "exporters": [ + "onnx", + "onnxruntime", + "timm", + # "transformers<4.47.0", + "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", + ], + "exporters-gpu": [ + "onnx", + "onnxruntime-gpu", + "timm", + # "transformers<4.47.0", + "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", ], - "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.47.0"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.47.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", From 27a2934e62e5f2486ea1a2c181f63d75e635171b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 24 Oct 2024 15:18:21 +0200 Subject: [PATCH 18/33] allow 4.47 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9dad6f71427..5444930ab51 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers>=4.29,<4.47", + "transformers>=4.29", "torch>=1.11", "packaging", "numpy", From 4e9b3b5c105d972dae62538dbb157e2652b6308a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 25 Oct 2024 17:26:02 +0200 Subject: [PATCH 19/33] remove patch --- .github/workflows/test_cli.yml | 2 +- optimum/exporters/onnx/model_configs.py | 6 -- optimum/exporters/onnx/model_patcher.py | 89 +------------------------ 3 files changed, 2 insertions(+), 95 deletions(-) diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml index be000caf6a3..2efab40aab6 100644 --- a/.github/workflows/test_cli.yml +++ b/.github/workflows/test_cli.yml @@ -34,7 +34,7 @@ jobs: run: | pip install --upgrade pip pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[tests,exporters,exporters-tf] + pip install .[tests,exporters-tf] - name: Test with pytest run: | diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 714e3c35755..c97cd196ffe 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -76,7 +76,6 @@ FalconModelPatcher, MistralModelPatcher, MusicgenModelPatcher, - Phi3ModelPatcher, SAMModelPatcher, SentenceTransformersCLIPPatcher, SentenceTransformersTransformerPatcher, @@ -305,11 +304,6 @@ class Phi3OnnxConfig(PhiOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfigWithGQA MIN_TRANSFORMERS_VERSION = version.parse("4.41.0") - def patch_model_for_export( - self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None - ) -> "ModelPatcher": - return Phi3ModelPatcher(self, model, model_kwargs=model_kwargs) - class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35 diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 9b8119dc2f4..fdfb0e280f5 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -34,11 +34,10 @@ if _transformers_version > version.parse("4.34.99"): - from transformers.modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask + from transformers.modeling_attn_mask_utils import AttentionMaskConverter if _transformers_version >= version.parse("4.36"): from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa else: - _prepare_4d_causal_attention_mask = None _prepare_4d_causal_attention_mask_for_sdpa = None AttentionMaskConverter = None @@ -1155,89 +1154,3 @@ def __exit__(self, exc_type, exc_value, traceback): from transformers.models.clip.modeling_clip import CLIPSdpaAttention CLIPSdpaAttention.forward = self.original_sdpa_forward - - -# Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Phi3 -def _prepare_4d_causal_attention_mask_with_cache_position_patched( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - cache_position: torch.Tensor, - batch_size: int, - config: Any, - past_key_values: Any, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. - device (`torch.device`): - The device to plcae the 4D attention mask on. - cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - config (`Phi3Config`): - The model's configuration class - past_key_values (`Cache`): - The cache class that is being used currently to generate - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. - causal_mask = attention_mask - else: - min_dtype = torch.finfo(dtype).min - causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) - diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) - if config.sliding_window is not None: - # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also - # the check is needed to verify is current checkpoint was trained with sliding window or not - if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: - sliding_attend_mask = torch.arange(target_length, device=device) <= ( - cache_position.reshape(-1, 1) - config.sliding_window - ) - diagonal_attend_mask.bitwise_or_(sliding_attend_mask) - causal_mask *= diagonal_attend_mask - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - if attention_mask.shape[-1] > target_length: - attention_mask = attention_mask[:, :target_length] - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - padding_mask, min_dtype - ) - return causal_mask - - -class Phi3ModelPatcher(ModelPatcher): - def __init__( - self, - config: "OnnxConfig", - model: Union["PreTrainedModel", "TFPreTrainedModel"], - model_kwargs: Optional[Dict[str, Any]] = None, - ): - super().__init__(config, model, model_kwargs) - - if _transformers_version >= version.parse("4.46.0"): - if hasattr(self._model, "model"): - self._model.model._prepare_4d_causal_attention_mask_with_cache_position = ( - _prepare_4d_causal_attention_mask_with_cache_position_patched - ) - else: - self._model._prepare_4d_causal_attention_mask_with_cache_position = ( - _prepare_4d_causal_attention_mask_with_cache_position_patched - ) From 5ef5c4813e7c051a29067913828f91de590f1730 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Fri, 25 Oct 2024 18:17:23 +0200 Subject: [PATCH 20/33] add opt --- tests/onnxruntime/test_modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 33243da278a..74ea8fc297d 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2314,6 +2314,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "llama", "mistral", "mpt", + "opt", "phi3", "qwen2", ] From 41e6be5c93bc87478f996dfa68fcba8102f90e5b Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Fri, 25 Oct 2024 18:19:45 +0200 Subject: [PATCH 21/33] add test --- tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 5071d0081af..e3d54237857 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -125,6 +125,7 @@ "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", "mt5": "lewtun/tiny-random-mt5", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", + "opt": "hf-internal-testing/tiny-random-OPTModel", "pegasus": "hf-internal-testing/tiny-random-PegasusModel", "perceiver_text": "hf-internal-testing/tiny-random-language_perceiver", "perceiver_vision": "hf-internal-testing/tiny-random-vision_perceiver_conv", From 4b16e7c41c9be73317ba3ecbdce49ec93f42489f Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 28 Oct 2024 16:52:37 +0100 Subject: [PATCH 22/33] fix OPT ONNX export and inference --- optimum/exporters/onnx/model_configs.py | 14 ++++++++++---- optimum/exporters/onnx/utils.py | 6 +++++- optimum/onnxruntime/modeling_decoder.py | 4 ++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index c97cd196ffe..0988b68aaf0 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -266,10 +266,16 @@ class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig -class OPTOnnxConfig(TextDecoderOnnxConfig): - # OPT does not require position_ids input. - DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + +# OPT does not take position_ids as input for transfomers < v4.46, needs it for transformers >= v4.46 +if check_if_transformers_greater("4.45.99"): + class OPTOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +else: + class OPTOnnxConfig(TextDecoderOnnxConfig): + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 675566ba23e..56249bbf5c3 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -27,7 +27,7 @@ is_diffusers_available, logging, ) -from ...utils.import_utils import _diffusers_version +from ...utils.import_utils import _diffusers_version, check_if_transformers_greater from ..utils import ( _get_submodels_and_export_configs, ) @@ -89,6 +89,10 @@ } +if check_if_transformers_greater("4.45.99"): + MODEL_TYPES_REQUIRING_POSITION_IDS.add("opt") + + def check_onnxruntime_requirements(minimum_version: version.Version): """ Checks that ONNX Runtime is installed and if version is recent enough. diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index bda3ec98d9a..984d7f22ebf 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -582,7 +582,8 @@ def _from_pretrained( init_cls = ORTFalconForCausalLM elif config.model_type == "mpt": init_cls = ORTMPTForCausalLM - elif config.model_type == "opt": + # if model was exported with position_ids it means the model was exported with transformers >= v4.46 + elif config.model_type == "opt" and "position_ids" not in input_dims: init_cls = ORTOPTForCausalLM elif config.model_type == "gpt_bigcode": init_cls = ORTGPTBigCodeForCausalLM @@ -839,7 +840,6 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) - return { "input_ids": input_ids, "past_key_values": past_key_values, From 49e911fe4fc08fc08cad1aa06959f062aa67463f Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 28 Oct 2024 16:52:44 +0100 Subject: [PATCH 23/33] add test --- tests/onnxruntime/test_modeling.py | 58 +++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 74ea8fc297d..9e6ed57267a 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -57,6 +57,7 @@ MBartForConditionalGeneration, Pix2StructForConditionalGeneration, # Pix2Struct does not work with AutoModel PretrainedConfig, + GenerationConfig, set_seed, ) from transformers.modeling_outputs import ImageSuperResolutionOutput @@ -2463,28 +2464,59 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach # TODO: remove once https://github.com/huggingface/transformers/pull/26873 is released, falcon is broken in transformers new_tokens = 5 - onnx_outputs = onnx_model.generate( - **tokens, - num_beams=num_beams, - do_sample=False, - min_new_tokens=new_tokens, + gen_config = GenerationConfig( max_new_tokens=new_tokens, - eos_token_id=None, - ) - - transformers_outputs = transformers_model.generate( - **tokens, + min_new_tokens=new_tokens, num_beams=num_beams, do_sample=False, - min_new_tokens=new_tokens, - max_new_tokens=new_tokens, eos_token_id=None, ) - + onnx_outputs = onnx_model.generate(**tokens, generation_config=gen_config) + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) self.assertTrue(torch.allclose(onnx_outputs, transformers_outputs)) gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_beam_search(self, model_arch): + model_id = MODEL_NAMES[model_arch] + tokenizer = AutoTokenizer.from_pretrained(model_id) + + gen_kwargs = {"max_new_tokens": 5, "min_new_tokens":5, "eos_token_id":None, "num_beams":4} + beam_search_gen_config = GenerationConfig(do_sample=False, **gen_kwargs) + beam_sample_gen_config = GenerationConfig(do_sample=True, **gen_kwargs) + group_beam_search_gen_config = GenerationConfig(do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs) + + force_words_ids = [tokenizer(["cat"], add_special_tokens=False).input_ids] + constrained_beam_search_gen_config = GenerationConfig(do_sample=False, force_words_ids=force_words_ids, **gen_kwargs) + + gen_configs = ( + beam_search_gen_config, + beam_sample_gen_config, + group_beam_search_gen_config, + constrained_beam_search_gen_config, + ) + set_seed(SEED) + onnx_model = ORTModelForCausalLM.from_pretrained(model_id, export=True) + set_seed(SEED) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id) + + tokenizer.pad_token_id = tokenizer.eos_token_id + tokens = tokenizer(["Input text", "Hello my name is James and"], return_tensors="pt", padding=True) + tokens.pop("token_type_ids", None) + + for gen_config in gen_configs: + set_seed(SEED) + with torch.no_grad(): + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + set_seed(SEED) + onnx_outputs = onnx_model.generate(**tokens, generation_config=gen_config) + + self.assertTrue( + torch.equal(onnx_outputs, transformers_outputs), + f"Failed with generation config : {gen_config}, transformers outputs {transformers_outputs}, ONNX model outputs {onnx_outputs}", + ) + @parameterized.expand(grid_parameters(FULL_GRID)) def test_pipeline_ort_model(self, test_name: str, model_arch: str, use_cache: bool): use_io_binding = None From 671293ec574a42f31f8faf38ae04f07ebf51555a Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 28 Oct 2024 16:52:50 +0100 Subject: [PATCH 24/33] update setup --- setup.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 5444930ab51..0c23d55af0b 100644 --- a/setup.py +++ b/setup.py @@ -54,8 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - # "transformers<4.47.0", - "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", + "transformers<4.47.0", ], "onnxruntime-gpu": [ "onnx", @@ -64,22 +63,19 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. - # "transformers<4.47.0", - "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", + "transformers<4.47.0", ], "exporters": [ "onnx", "onnxruntime", "timm", - # "transformers<4.47.0", - "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", + "transformers<4.47.0", ], "exporters-gpu": [ "onnx", "onnxruntime-gpu", "timm", - # "transformers<4.47.0", - "transformers@git+https://github.com/huggingface/transformers.git@fix-pix2struct", + "transformers<4.47.0", ], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", From a9856170e3448e486c8499524548c84aff7944df Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 28 Oct 2024 16:53:19 +0100 Subject: [PATCH 25/33] style --- optimum/exporters/onnx/model_configs.py | 4 +++- tests/onnxruntime/test_modeling.py | 10 +++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 0988b68aaf0..6cf1060168d 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -266,13 +266,15 @@ class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig - # OPT does not take position_ids as input for transfomers < v4.46, needs it for transformers >= v4.46 if check_if_transformers_greater("4.45.99"): + class OPTOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + else: + class OPTOnnxConfig(TextDecoderOnnxConfig): DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 9e6ed57267a..756f8a3659a 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2482,13 +2482,17 @@ def test_beam_search(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) - gen_kwargs = {"max_new_tokens": 5, "min_new_tokens":5, "eos_token_id":None, "num_beams":4} + gen_kwargs = {"max_new_tokens": 5, "min_new_tokens": 5, "eos_token_id": None, "num_beams": 4} beam_search_gen_config = GenerationConfig(do_sample=False, **gen_kwargs) beam_sample_gen_config = GenerationConfig(do_sample=True, **gen_kwargs) - group_beam_search_gen_config = GenerationConfig(do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs) + group_beam_search_gen_config = GenerationConfig( + do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs + ) force_words_ids = [tokenizer(["cat"], add_special_tokens=False).input_ids] - constrained_beam_search_gen_config = GenerationConfig(do_sample=False, force_words_ids=force_words_ids, **gen_kwargs) + constrained_beam_search_gen_config = GenerationConfig( + do_sample=False, force_words_ids=force_words_ids, **gen_kwargs + ) gen_configs = ( beam_search_gen_config, From b86eaf4a9697e468adf4e724a5bcf03d78ffee0c Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 28 Oct 2024 17:00:34 +0100 Subject: [PATCH 26/33] merge tests --- tests/onnxruntime/test_modeling.py | 61 ++++++++++-------------------- 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 756f8a3659a..0e3d2359aca 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2464,50 +2464,27 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach # TODO: remove once https://github.com/huggingface/transformers/pull/26873 is released, falcon is broken in transformers new_tokens = 5 - gen_config = GenerationConfig( - max_new_tokens=new_tokens, - min_new_tokens=new_tokens, - num_beams=num_beams, - do_sample=False, - eos_token_id=None, - ) - onnx_outputs = onnx_model.generate(**tokens, generation_config=gen_config) - transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) - self.assertTrue(torch.allclose(onnx_outputs, transformers_outputs)) - - gc.collect() - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_beam_search(self, model_arch): - model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) + gen_kwargs = { + "max_new_tokens": new_tokens, + "min_new_tokens": new_tokens, + "eos_token_id": None, + "num_beams": num_beams, + } - gen_kwargs = {"max_new_tokens": 5, "min_new_tokens": 5, "eos_token_id": None, "num_beams": 4} beam_search_gen_config = GenerationConfig(do_sample=False, **gen_kwargs) - beam_sample_gen_config = GenerationConfig(do_sample=True, **gen_kwargs) - group_beam_search_gen_config = GenerationConfig( - do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs - ) - force_words_ids = [tokenizer(["cat"], add_special_tokens=False).input_ids] - constrained_beam_search_gen_config = GenerationConfig( - do_sample=False, force_words_ids=force_words_ids, **gen_kwargs - ) - - gen_configs = ( - beam_search_gen_config, - beam_sample_gen_config, - group_beam_search_gen_config, - constrained_beam_search_gen_config, - ) - set_seed(SEED) - onnx_model = ORTModelForCausalLM.from_pretrained(model_id, export=True) - set_seed(SEED) - transformers_model = AutoModelForCausalLM.from_pretrained(model_id) - - tokenizer.pad_token_id = tokenizer.eos_token_id - tokens = tokenizer(["Input text", "Hello my name is James and"], return_tensors="pt", padding=True) - tokens.pop("token_type_ids", None) + if use_cache and num_beams == 3: + beam_sample_gen_config = GenerationConfig(do_sample=True, **gen_kwargs) + group_beam_search_gen_config = GenerationConfig( + do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs + ) + gen_configs = ( + beam_search_gen_config, + beam_sample_gen_config, + group_beam_search_gen_config, + ) + else: + gen_configs = (beam_search_gen_config,) for gen_config in gen_configs: set_seed(SEED) @@ -2521,6 +2498,8 @@ def test_beam_search(self, model_arch): f"Failed with generation config : {gen_config}, transformers outputs {transformers_outputs}, ONNX model outputs {onnx_outputs}", ) + gc.collect() + @parameterized.expand(grid_parameters(FULL_GRID)) def test_pipeline_ort_model(self, test_name: str, model_arch: str, use_cache: bool): use_io_binding = None From 6ebf667c922305e39878e0a82ca3f02ea01d16ad Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 28 Oct 2024 17:19:06 +0100 Subject: [PATCH 27/33] update tes num beams --- tests/onnxruntime/test_modeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 0e3d2359aca..c7068419088 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -54,10 +54,10 @@ AutoModelForTokenClassification, AutoModelForVision2Seq, AutoTokenizer, + GenerationConfig, MBartForConditionalGeneration, Pix2StructForConditionalGeneration, # Pix2Struct does not work with AutoModel PretrainedConfig, - GenerationConfig, set_seed, ) from transformers.modeling_outputs import ImageSuperResolutionOutput @@ -2401,7 +2401,7 @@ def test_merge_from_onnx_and_save(self, model_arch): self.assertNotIn(ONNX_DECODER_WITH_PAST_NAME, folder_contents) self.assertNotIn(ONNX_WEIGHTS_NAME, folder_contents) - @parameterized.expand(grid_parameters({**FULL_GRID, "num_beams": [1, 3]})) + @parameterized.expand(grid_parameters({**FULL_GRID, "num_beams": [1, 4]})) def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cache: bool, num_beams: int): use_io_binding = None if use_cache is False: @@ -2473,7 +2473,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach beam_search_gen_config = GenerationConfig(do_sample=False, **gen_kwargs) - if use_cache and num_beams == 3: + if use_cache and num_beams == 4: beam_sample_gen_config = GenerationConfig(do_sample=True, **gen_kwargs) group_beam_search_gen_config = GenerationConfig( do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs From 1ce1a0f4343de0c16de7e4b7daceb1684ee741a7 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 28 Oct 2024 17:30:52 +0100 Subject: [PATCH 28/33] add test transformers version --- .github/workflows/test_onnxruntime.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 7bbe00bd157..6ef6c1edc71 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -17,8 +17,13 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + transformers-version: ["latest"] os: [ubuntu-20.04, windows-2019, macos-13] + include: + - transformers-version: "4.36.*" + os: ubuntu-20.04 + - transformers-version: "4.45.*" + os: ubuntu-20.04 runs-on: ${{ matrix.os }} steps: @@ -33,10 +38,10 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup Python ${{ matrix.python-version }} + - name: Setup Python uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: 3.9 - name: Install dependencies run: | @@ -44,6 +49,10 @@ jobs: pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install .[tests,onnxruntime] + - name: Install transformers ${{ matrix.transformers-version }} + if: ${{ matrix.transformers-version != 'latest' }} + run: pip install transformers==${{ matrix.transformers-version }} + - name: Test with pytest (in series) working-directory: tests run: | From 641d3516a050ca70231c22b09ba778d99fbb73c7 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 29 Oct 2024 10:24:58 +0100 Subject: [PATCH 29/33] add architectures depending on transformers --- tests/onnxruntime/test_modeling.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index c7068419088..fa5e7353aa2 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -107,7 +107,7 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) -from optimum.utils.import_utils import is_diffusers_available +from optimum.utils.import_utils import is_diffusers_available, check_if_transformers_greater from optimum.utils.testing_utils import ( grid_parameters, remove_directory, @@ -2316,10 +2316,12 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "mistral", "mpt", "opt", - "phi3", - "qwen2", ] + + if check_if_transformers_greater("4.40"): + SUPPORTED_ARCHITECTURES.extend(["gemma", "phi3", "qwen2"]) + FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [False, True], From d1725f7b90962a0e44bc5b53877d9953eb7a498c Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 29 Oct 2024 10:28:16 +0100 Subject: [PATCH 30/33] add warning --- optimum/exporters/onnx/model_configs.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 6cf1060168d..8893e272286 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -312,6 +312,15 @@ class Phi3OnnxConfig(PhiOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfigWithGQA MIN_TRANSFORMERS_VERSION = version.parse("4.41.0") + def __init__(self, *args, **kwargs): + # TODO : replace check_if_transformers_greater with is_transformers_available + if check_if_transformers_greater("4.46.0") and not check_if_transformers_greater("4.46.1"): + logger.error( + "Found transformers v4.46.0 while trying to exporting a Phi3 model, this specific version of transformers is not supported. " + "Please upgrade to v4.46.1 or higher, or downgrade your transformers version" + ) + super().__init__(*args, **kwargs) + class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35 @@ -2168,6 +2177,19 @@ class Pix2StructOnnxConfig(OnnxSeq2SeqConfigWithPast): DEFAULT_ONNX_OPSET = 14 # use 'aten::triu' now which is opset 14 + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # TODO : replace check_if_transformers_greater with is_transformers_available + if ( + check_if_transformers_greater("4.46.0") + and not check_if_transformers_greater("4.46.1") + and self._behavior is ConfigBehavior.DECODER + ): + logger.error( + "Found transformers v4.46.0 while trying to exporting a Pix2Struct model, this specific version of transformers is not supported. " + "Please upgrade to v4.46.1 or higher, or downgrade your transformers version" + ) + @property def inputs(self): common_inputs = {} From d3c3e7aecf203f9c9f9dc45e0786ee3f7e0c248a Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 29 Oct 2024 10:30:12 +0100 Subject: [PATCH 31/33] revert --- .github/workflows/test_onnxruntime.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 6ef6c1edc71..0ab95752d01 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -20,8 +20,6 @@ jobs: transformers-version: ["latest"] os: [ubuntu-20.04, windows-2019, macos-13] include: - - transformers-version: "4.36.*" - os: ubuntu-20.04 - transformers-version: "4.45.*" os: ubuntu-20.04 From fac176c1ddc1fecfc3894850a58d1087c6d0cfbf Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 29 Oct 2024 10:43:01 +0100 Subject: [PATCH 32/33] update test generation length --- tests/onnxruntime/test_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index fa5e7353aa2..a5f6228b93d 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2330,7 +2330,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): ORTMODEL_CLASS = ORTModelForCausalLM TASK = "text-generation" - GENERATION_LENGTH = 100 + GENERATION_LENGTH = 90 SPEEDUP_CACHE = 1.1 @parameterized.expand([(False,), (True,)]) From 0cec49b5ba7ce2fdc715dc7e2aae86953abeac3e Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 29 Oct 2024 10:52:20 +0100 Subject: [PATCH 33/33] style --- optimum/exporters/onnx/model_configs.py | 2 +- tests/onnxruntime/test_modeling.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 8893e272286..9e57128c272 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -2187,7 +2187,7 @@ def __init__(self, *args, **kwargs): ): logger.error( "Found transformers v4.46.0 while trying to exporting a Pix2Struct model, this specific version of transformers is not supported. " - "Please upgrade to v4.46.1 or higher, or downgrade your transformers version" + "Please upgrade to v4.46.1 or higher, or downgrade your transformers version" ) @property diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index a5f6228b93d..bcf3762b39c 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -107,7 +107,7 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) -from optimum.utils.import_utils import is_diffusers_available, check_if_transformers_greater +from optimum.utils.import_utils import check_if_transformers_greater, is_diffusers_available from optimum.utils.testing_utils import ( grid_parameters, remove_directory, @@ -2318,7 +2318,6 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "opt", ] - if check_if_transformers_greater("4.40"): SUPPORTED_ARCHITECTURES.extend(["gemma", "phi3", "qwen2"])